Added src files

df67e161 · Stephen Mayhew · 34e25708 · df67e161 · df67e161 · df67e161
Commit df67e161 authored 9 years ago by Stephen Mayhew
--- a/pom.xml
+++ b/pom.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>edu.illinois.cs.cogcomp</groupId>
+    <artifactId>nplm-java</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+    <repositories>
+        <repository>
+            <id>CogcompSoftware</id>
+            <name>CogcompSoftware</name>
+            <url>http://cogcomp.cs.illinois.edu/m2repo/</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <!-- junit was declared twice (3.8.1 and 4.12); only the 4.12 declaration is kept,
+             since Maven requires each groupId:artifactId to appear once per POM. -->
+        <dependency>
+            <groupId>edu.illinois.cs.cogcomp</groupId>
+            <artifactId>illinois-core-utilities</artifactId>
+            <version>1.2.19</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.4</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>com.belerweb</groupId>
+            <artifactId>pinyin4j</artifactId>
+            <version>2.5.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>1.7.13</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-math3</artifactId>
+            <version>3.4</version>
+        </dependency>
+
+
+    </dependencies>
+
+</project>
\ No newline at end of file
--- a/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/NeuralLM.java
+package edu.illinois.cs.cogcomp;
+
+import edu.illinois.cs.cogcomp.core.io.LineIO;
+import org.apache.commons.math3.linear.*;
+import org.apache.commons.math3.random.JDKRandomGenerator;
+import org.apache.commons.math3.random.RandomGenerator;
+import org.apache.commons.math3.random.UniformRandomGenerator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileNotFoundException;
+import java.util.*;
+
+/**
+ * Created by mayhew2 on 11/18/15.
+ */
+public class NeuralLM {
+
+    private static Logger logger = LoggerFactory.getLogger(NeuralLM.class);
+
+    private int output_embedding_dimension;
+    private int n_hidden;
+    private int n_input_vocab;
+    private int n_output_vocab;
+    private int ngram_size;
+    private int input_embedding_dimension;
+
+    private RealMatrix input_embeddings;
+    private RealMatrix hidden1_weights;
+    private RealMatrix hidden2_weights;
+    private RealMatrix output_weights;
+    private RealMatrix output_biases;
+
+    private static RandomGenerator randomgenerator = new JDKRandomGenerator();
+    private static UniformRandomGenerator unif = new UniformRandomGenerator(randomgenerator);
+
+    // vocabulary files
+    private List<String> index_to_word;
+    private HashMap<String, Integer> word_to_index;
+
+    /**
+     * Input:  a and b are arrays.
+     * Output: a column vector of the dot product of the rows of a
+     * and respective columns of b, in other words, diag(a.dot(b))
+     */
+    public static RealVector diag_dot(RealMatrix a, RealMatrix b) {
+        RealMatrix m = a.multiply(b);
+        RealVector out = new ArrayRealVector(m.getRowDimension());
+
+        // I guess that matrix m should be square??
+        for(int i = 0; i < m.getRowDimension(); i++){
+            out.setEntry(i, m.getEntry(i,i));
+        }
+        return out;
+    }
+
+    /**
+     * Sets the matrix m to have uniform random values in the range (-r, r)
+     * @param m input matrix
+     * @param r range parameter
+     */
+    private static void uniform(RealMatrix m, int r) {
+        //m[:,:]=numpy.random.uniform(-r, r, m.shape);
+
+        for(int row = 0; row < m.getRowDimension(); row++){
+            for(int col = 0; col < m.getColumnDimension(); col++){
+                m.addToEntry(row,col, r*unif.nextNormalizedDouble());
+            }
+        }
+    }
+
+    /**
+     * This returns the elementwise maximum of a value and a matrix. For example, if the value is 0, then this
+     * ensures that the matrix has no negative values.
+     * @param v
+     * @param m
+     * @return
+     */
+    private static RealVector maximum(double v, RealVector m){
+        for(int i = 0; i < m.getDimension(); i++){
+            double mv = m.getEntry(i);
+            m.setEntry(i, Math.max(v, mv));
+        }
+        return m;
+    }
+
+    public NeuralLM(int ngram_size, int n_input_vocab, int n_output_vocab, int input_embedding_dimension, int n_hidden, int output_embedding_dimension) {
+
+        this.n_input_vocab = n_input_vocab;
+        this.n_output_vocab = n_output_vocab;
+        this.index_to_word = new ArrayList<>();
+        this.word_to_index = new HashMap<>();
+
+        this.ngram_size = ngram_size;
+        this.input_embedding_dimension = input_embedding_dimension;
+        this.n_hidden = n_hidden;
+        this.output_embedding_dimension = output_embedding_dimension;
+
+        // createRealMatrix initializes with 0
+        this.input_embeddings = MatrixUtils.createRealMatrix(n_input_vocab, input_embedding_dimension) ; //numpy.zeros((n_vocab, input_embedding_dimension));
+        this.hidden1_weights = MatrixUtils.createRealMatrix(n_hidden, (ngram_size-1)*input_embedding_dimension) ; //numpy.zeros((n_hidden, (ngram_size - 1) * input_embedding_dimension));
+        this.hidden2_weights = MatrixUtils.createRealMatrix(output_embedding_dimension, n_hidden) ; //numpy.zeros((output_embedding_dimension, n_hidden));
+        this.output_weights = MatrixUtils.createRealMatrix(n_output_vocab, output_embedding_dimension); // numpy.zeros((n_vocab, output_embedding_dimension));
+        this.output_biases = MatrixUtils.createRealMatrix(n_output_vocab, 1); // numpy.zeros((n_vocab, 1));
+
+    }
+
+    public void initialize(int r) {
+        uniform(this.input_embeddings, r);
+        uniform(this.hidden1_weights, r);
+        uniform(this.hidden2_weights, r);
+        uniform(this.output_weights, r);
+        uniform(this.output_biases, r);
+    }
+
+    /**
+     * Scores a single n-gram. The first {@code ngram_size - 1} words are the context: their
+     * embeddings are concatenated and passed through the two hidden layers (each followed by an
+     * elementwise maximum with 0). The value returned is the output-layer entry of the last word.
+     *
+     * <p>Words that are not in the vocabulary are replaced by {@code <unk>}. The result is the raw
+     * output value (output weights times the second hidden layer, plus bias); this method applies
+     * no normalisation over the vocabulary.
+     *
+     * <p>NOTE(review): the index of the predicted word is taken from the input vocabulary, because
+     * that is the only vocabulary this class stores. Confirm that input and output vocabularies
+     * agree for the models in use.
+     *
+     * @param ngramlist exactly {@code ngram_size} words, context first and the word to score last;
+     *                  the list is only read, never modified
+     * @return the output-layer value for the last word of the n-gram
+     * @throws IllegalArgumentException if the list does not contain {@code ngram_size} words
+     * @throws IllegalStateException    if a word is unknown and the vocabulary has no {@code <unk>} entry
+     * @throws Exception                kept in the signature for existing callers
+     */
+    public double ngram_prob(List<String> ngramlist) throws Exception {
+
+        if (ngramlist.size() != ngram_size) {
+            throw new IllegalArgumentException(String.format(
+                    "ngram list must have size %d (ngram_size), but has size %d", ngram_size, ngramlist.size()));
+        }
+
+        // the last word is the one to be predicted; everything before it is context
+        int last = ngramlist.size() - 1;
+        int targetind = lookup_index(ngramlist.get(last));
+
+        // concatenation of the context word embeddings
+        RealVector concatenated = new ArrayRealVector();
+        for (String word : ngramlist.subList(0, last)) {
+            // Row `ind` of the embedding matrix is exactly what multiplying the transposed
+            // matrix by the one-hot vector for `ind` would select.
+            RealVector r_embed = this.input_embeddings.getRowVector(lookup_index(word));
+            concatenated = concatenated.append(r_embed);
+        }
+
+        // elementwise maximum with 0 after each hidden layer
+        RealVector h1 = maximum(0., this.hidden1_weights.operate(concatenated));
+        RealVector h2 = maximum(0., this.hidden2_weights.operate(h1));
+
+        // output_biases is a single-column matrix; column 0 is the bias vector
+        RealVector vocabdist = this.output_weights.operate(h2).add(this.output_biases.getColumnVector(0));
+
+        return vocabdist.getEntry(targetind);
+    }
+
+    /** Returns the vocabulary index of {@code word}, falling back to the index of {@code <unk>}. */
+    private int lookup_index(String word) {
+        Integer ind = this.word_to_index.get(word);
+        if (ind == null) {
+            ind = this.word_to_index.get("<unk>");
+        }
+        if (ind == null) {
+            throw new IllegalStateException(
+                    String.format("word '%s' is not in the vocabulary and the vocabulary has no <unk> entry", word));
+        }
+        return ind;
+    }
+
+    /**
+     * Placeholder for back-propagation. It deliberately does nothing; the Python code
+     * has no implementation for this step either.
+     *
+     * @param g_output ignored
+     */
+    public void backward_prop(Object g_output) {
+        // intentionally empty
+    }
+
+//    def write_matrix(RealMatrix m, String outfile) {
+//
+//        for (int i = 0; i < 10; i++) {
+//            outfile.write("\t".join(map(str, m[i])));
+//            outfile.write("\n");
+//        }
+//        outfile.write("\n");
+//    }
+//
+//    public void write_vector(m, outfile) {
+//
+//        for (int i = 0; i < 10; i++) {
+//            outfile.write(str(m[i]));
+//            outfile.write("\n")
+//
+//        }
+//        outfile.write("\n");
+//    }
+
+//    public void to_file(outfile) {
+//
+//        outfile.write("\\config\n");
+//        outfile.write("version 1\n");
+//        outfile.write("ngram_size %d\n" % this.ngram_size);
+//        outfile.write("n_vocab %d\n" % this.n_vocab);
+//        outfile.write("input_embedding_dimension %d\n" % this.input_embedding_dimension);
+//        outfile.write("output_embedding_dimension %d\n" % this.output_embedding_dimension);
+//        outfile.write("n_hidden %d\n" % this.n_hidden);
+//        outfile.write("\n");
+//
+//        outfile.write("\\vocab\n");
+//        for
+//        word in
+//        this.index_to_word:
+//        outfile.write(word + "\n")
+//        outfile.write("\n")
+//
+//        outfile.write("\\input_embeddings\n")
+//
+//        write_matrix(this.input_embeddings)
+//
+//        outfile.write("\\hidden_weights 1\n")
+//
+//        write_matrix(this.hidden1_weights)
+//
+//        outfile.write("\\hidden_weights 2\n")
+//
+//        write_matrix(this.hidden2_weights)
+//
+//        outfile.write("\\output_weights\n")
+//
+//        write_matrix(this.output_weights)
+//
+//        outfile.write("\\output_biases\n")
+//
+//        write_matrix(this.output_biases)
+//
+//        outfile.write("\\end\n")
+//    }
+
+
+    /**
+     * Advances the iterator to the next section header, i.e. the next line that starts with a
+     * backslash after trimming, and returns that line.
+     *
+     * <p>Running out of lines is treated like reaching the end marker, so a file without a final
+     * {@code \end} line is still readable.
+     *
+     * @param iter iterator over the remaining lines of the file
+     * @return the trimmed header line, or {@code \end} when the end marker or the end of input is reached
+     */
+    private static String read_sections(Iterator<String> iter){
+        while (iter.hasNext()) {
+            String line = iter.next().trim();
+
+            // "\end" starts with a backslash as well, so it is returned by the same branch
+            if (line.startsWith("\\")) {
+                return line;
+            }
+        }
+        return "\\end";
+    }
+
+    /**
+     * Collects the lines of the current section: everything up to, but not including, the next
+     * blank line. The blank line itself is consumed. If the input ends first, the lines read so
+     * far are returned.
+     *
+     * @param iter iterator positioned on the first line of the section body
+     * @return the trimmed, non-empty lines of the section, in file order
+     */
+    private static List<String> read_section(Iterator<String> iter) {
+        List<String> ret = new ArrayList<>();
+        // hasNext() guard: the last section of a file need not be followed by a blank line
+        while (iter.hasNext()) {
+            String line = iter.next().trim();
+            if (line.isEmpty()) {
+                break;
+            }
+            ret.add(line);
+        }
+        return ret;
+    }
+
+    /**
+     * Parses an {@code m x n} matrix from text, one matrix row per line. Values on a line may be
+     * separated by tabs or any other run of whitespace.
+     *
+     * @param lines the rows of the matrix, first row first
+     * @param m     expected number of rows
+     * @param n     expected number of columns
+     * @return the parsed matrix
+     * @throws NumberFormatException if a value is not a valid double
+     * @throws Exception             if the number of rows or columns differs from {@code m} / {@code n}
+     */
+    public static RealMatrix read_matrix(List<String> lines, int m, int n) throws Exception {
+
+        RealMatrix out = MatrixUtils.createRealMatrix(m, n);
+
+        int i = 0;
+        for (String line : lines) {
+            // numpy.array(map(float,line.split())) -- split() there means "any whitespace"
+            String[] sline = line.trim().split("\\s+");
+
+            double[] row = new double[sline.length];
+            for (int j = 0; j < sline.length; j++) {
+                row[j] = Double.parseDouble(sline[j]);
+            }
+
+            if (row.length != n) {
+                throw new Exception(String.format("wrong number of columns (expected %d, found %d)", n, row.length));
+            }
+            if (i >= m) {
+                throw new Exception(String.format("wrong number of rows (expected %d, found more)", m));
+            }
+            //out[i,:] = row;
+            out.setRow(i, row);
+
+            i += 1;
+        }
+        if (i < m) {
+            throw new Exception(String.format("wrong number of rows (expected %d, found %d)", m, i));
+        }
+
+        return out;
+    }
+
+    /**
+     * Create a NeuralLM from a text file.
+     */
+    public static NeuralLM from_file(String infile) throws Exception {
+
+        List<String> lines = LineIO.read(infile);
+        Iterator<String> iter = lines.iterator();
+
+        NeuralLM m = null;
+
+        while(iter.hasNext()){
+            String section = read_sections(iter);
+
+            if(section.contains("end")){
+                break;
+            }
+
+            List<String> sectionlines = read_section(iter);
+
+            if(section.equals("\\config")) {
+                HashMap<String, String> config = new HashMap<>();
+
+                for(String line : sectionlines) {
+                    String[] sline = line.split(" ");
+                    String key = sline[0];
+                    String value = sline[1];
+                    config.put(key, value);
+                }
+
+                m = new NeuralLM(Integer.parseInt(config.get("ngram_size")),
+                        Integer.parseInt(config.get("input_vocab_size")),
+                        Integer.parseInt(config.get("output_vocab_size")),
+                        Integer.parseInt(config.get("input_embedding_dimension")),
+                        Integer.parseInt(config.get("num_hidden")),
+                        Integer.parseInt(config.get("output_embedding_dimension")));
+
+            } else if(section.equals("\\input_vocab")) {
+
+                int i = 0;
+                for (String line : sectionlines) {
+                    m.index_to_word.add(line);
+                    m.word_to_index.put(line, i++);
+                }
+            }else if(section.equals("\\output_vocab")){
+                // FIXME: not sure there is anything I need to do???
+            } else if(section.equals("\\input_embeddings")) {
+                logger.debug("got here");
+                m.input_embeddings = read_matrix(sectionlines, m.n_input_vocab, m.input_embedding_dimension);
+            }else if(section.equals("\\hidden_weights 1")) {
+                m.hidden1_weights = read_matrix(sectionlines, m.n_hidden, (m.ngram_size - 1) * m.input_embedding_dimension);
+            }else if(section.equals("\\hidden_weights 2")) {
+                m.hidden2_weights = read_matrix(sectionlines, m.output_embedding_dimension, m.n_hidden);
+            } else if(section.equals("\\output_weights")) {
+                m.output_weights = read_matrix(sectionlines, m.n_output_vocab, m.output_embedding_dimension);
+            } else if(section.equals("\\output_biases")) {
+                m.output_biases = read_matrix(sectionlines, m.n_output_vocab, 1);
+            }
+        }
+        return m;
+    }
+
+//    /**
+//     * Takes a list of n-grams of words (as ints),
+//     * and converts into a list of n sparse arrays.
+//     */
+//    public static void make_data(List<List<String>> ngrams) {
+//
+//        rows =[[]for
+//        j in
+//
+//        xrange(this.ngram_size)
+//
+//        ]
+//        cols =[[]for
+//        j in
+//
+//        xrange(this.ngram_size)
+//
+//        ]
+//        values =[[]for
+//        j in
+//
+//        xrange(this.ngram_size)
+//
+//        ]
+//        for i,
+//                ngram in
+//
+//        enumerate(ngrams)
+//
+//        :
+//        for j,
+//                w in
+//
+//        enumerate(ngram)
+//
+//        :
+//        rows[j].
+//
+//                append(w)
+//
+//        cols[j].
+//
+//                append(i)
+//
+//        values[j].
+//
+//                append(1)
+//
+//        data =[scipy.sparse.csc_matrix((values[j], (rows[j], cols[j])),shape = (this.n_vocab,
+//
+//                len(ngrams)
+//
+//        ))for
+//        j in
+//
+//        xrange(this.ngram_size)
+//
+//        ]
+//        return data;
+//    }
+}
\ No newline at end of file
--- a/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/NplmJni.java
+package edu.illinois.cs.cogcomp;
+
+/**
+ * Created by mayhew2 on 11/18/15.
+ */
+public class NplmJni {
+
+    static {
+        System.loadLibrary("nplm");
+    }
+
+    private native void sayHello();
+
+    public static void main(String[] args){
+        new NplmJni().sayHello();
+    }
+
+}
--- a/src/main/java/edu/illinois/cs/cogcomp/Tester.java
+++ b/src/main/java/edu/illinois/cs/cogcomp/Tester.java
+package edu.illinois.cs.cogcomp;
+
+import org.apache.commons.math3.linear.*;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Small command-line check: loads a model file and prints the score of one fixed trigram.
+ *
+ * Created by mayhew2 on 11/19/15.
+ */
+public class Tester {
+
+    /** Model that is loaded when no path is given on the command line. */
+    private static final String DEFAULT_MODEL =
+            "/shared/experiments/mayhew2/transliteration/NEURAL_LANGUAGE_MODEL/example/inferno.nnlm";
+
+    /**
+     * @param args optional: {@code args[0]} is the path of the model file to load
+     * @throws Exception if the model cannot be read or scored
+     */
+    public static void main(String[] args) throws Exception {
+
+        String modelPath = args.length > 0 ? args[0] : DEFAULT_MODEL;
+
+        NeuralLM m = NeuralLM.from_file(modelPath);
+        if (m == null) {
+            // from_file yields null when the file has no \config section
+            throw new IllegalStateException("no model could be loaded from " + modelPath);
+        }
+
+        List<String> ngrams = new ArrayList<>();
+
+        ngrams.add("fair");
+        ngrams.add("and");
+        ngrams.add("xkcd");
+
+        System.out.println(m.ngram_prob(ngrams));
+
+    }
+
+}
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
+# suppress inspection "UnusedProperty" for whole file
+# Root logger option
+# Every logger inherits this: messages at DEBUG level and above go to the "stdout" appender.
+log4j.rootLogger=DEBUG, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+# Line format: time (HH:mm:ss), level left-aligned in 5 columns, last component of the
+# logger name, source line number, then the message and a line break.
+log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1}:%L - %m%n
\ No newline at end of file