Commit 7e2c3e79 authored by Christos Christodoulopoulos

Massive cleaning up

Using new AnnotatorService
parent ae4e3cee
1 merge request: !1 Shyam
Showing with 197 additions and 379 deletions
Version 5.1.4
Switched entirely to illinois-sl for structured prediction (removed JLIS traces)
Using the latest AnnotatorService from illinois-core-utilities for both Curator & pipeline annotation
Major cleaning up
Version 5.1
Added JUnit tests
Removed unnecessary dependencies
......
# illinois-srl: Semantic Role Labeler
### Running
#### Interactive mode
#### As `Annotator`
### Training
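Training is driven by the command-line entry points in `Main` (see the `expt` and `train` diffs further down). A minimal programmatic sketch using the new two-argument `expt` signature from this commit; the class name `TrainVerbSrl` is illustrative, and `Main` is assumed to live in the same package as `Constants`:

```java
import edu.illinois.cs.cogcomp.srl.Main;

public class TrainVerbSrl {
    public static void main(String[] args) throws Exception {
        // Full training & testing sequence for verb SRL; "true" caches the datasets first.
        // Command-line equivalent: expt Verb cacheDatasets=true
        Main.expt("Verb", "true");
    }
}
```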
# {L2LossSSVM, StructuredPerceptron}
LEARNING_MODEL = L2LossSSVM
# {DCDSolver, ParallelDCDSolver, DEMIParallelDCDSolver};
L2_LOSS_SSVM_SOLVER_TYPE = ParallelDCDSolver
NUMBER_OF_THREADS = 8
C_FOR_STRUCTURE = 1.0
TRAINMINI = false
TRAINMINI_SIZE = 1000
STOP_CONDITION = 0.1
CHECK_INFERENCE_OPT = false
MAX_NUM_ITER = 250
PROGRESS_REPORT_ITER = 10
INNER_STOP_CONDITION = 0.1
MAX_ITER_INNER = 250
MAX_ITER_INNER_FINAL = 2500
TOTAL_NUMBER_FEATURE = -1
CLEAN_CACHE = true
CLEAN_CACHE_ITER = 5
LEARNING_RATE = 0.01
DECAY_LEARNING_RATE = false
# {L2LossSSVM, StructuredPerceptron}
LEARNING_MODEL = StructuredPerceptron
NUMBER_OF_THREADS = 1
CHECK_INFERENCE_OPT = false
MAX_NUM_ITER = 10
TOTAL_NUMBER_FEATURE = -1
LEARNING_RATE = 0.01
DECAY_LEARNING_RATE = false
NUMBER_OF_FEATURE_BITS = 26
# Available learning models: {L2LossSSVM, StructuredPerceptron}
LEARNING_MODEL = L2LossSSVM
# Available solver types: {DCDSolver, ParallelDCDSolver, DEMIParallelDCDSolver}
L2_LOSS_SSVM_SOLVER_TYPE = ParallelDCDSolver
NUMBER_OF_THREADS = 8
# Regularization parameter
C_FOR_STRUCTURE = 1.0
# Mini-batch for 'warm' start
TRAINMINI = true
TRAINMINI_SIZE = 10000
# Suppress optimality check
CHECK_INFERENCE_OPT = false
# Number of training rounds
MAX_NUM_ITER = 100
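These settings are consumed by illinois-sl's `SLParameters`; the training code later in this commit loads them via `params.loadConfigFile(properties.getLearnerConfig())`. A minimal sketch (the config file path is illustrative):

```java
import edu.illinois.cs.cogcomp.sl.core.SLParameters;

public class LoadLearnerConfig {
    public static void main(String[] args) throws Exception {
        SLParameters params = new SLParameters();
        // Populates LEARNING_MODEL, L2_LOSS_SSVM_SOLVER_TYPE, NUMBER_OF_THREADS, etc.
        params.loadConfigFile("config/learner.properties");
        // Individual fields can still be overridden afterwards, as Main.train does:
        params.C_FOR_STRUCTURE = 1.0f;
    }
}
```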
useStanfordConvention true
# CoNLL config file
# Required fields
configFilename finalSystemBILOU
pathToModelFile data/Models/CoNLL
taggingEncodingScheme BILOU
tokenizationScheme DualTokenizationScheme
# Optional fields
beamSize 5
forceNewSentenceOnLineBreaks true
labelTypes PER ORG LOC MISC
logging false
# debuggingLogPath irrelevant
inferenceMethod GREEDY
normalizeTitleText false
pathToTokenNormalizationData brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt
predictionConfidenceThreshold -1
sortLexicallyFilesInFolders true
thresholdPrediction false
treatAllFilesInFolderAsOneBigDocument true
debug true
# Features
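# (a value of 1 enables the feature, 0 disables it)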
Forms 1
Capitalization 1
WordTypeInformation 1
Affixes 1
PreviousTag1 1
PreviousTag2 1
PreviousTagPatternLevel1 1
PreviousTagPatternLevel2 1
AggregateContext 0
AggregateGazetteerMatches 0
PrevTagsForContext 1
PredictionsLevel1 0
# Feature groups
BrownClusterPaths 1
isLowercaseBrownClusters false false false
pathsToBrownClusters brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt brown-clusters/brownBllipClusters brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt
minWordAppThresholdsForBrownClusters 5 5 5
GazetteersFeatures 1
pathToGazetteersLists ner-ext/KnownLists
WordEmbeddings 0
# pathsToWordEmbeddings WordEmbedding/model-2280000000.LEARNING_RATE=1e-08.EMBEDDING_LEARNING_RATE=1e-07.EMBEDDING_SIZE=50.gz
# embeddingDimensionalities 50
# minWordAppThresholdsForEmbeddings 0
# normalizationConstantsForEmbeddings 1.0
# normalizationMethodsForEmbeddings OVERALL
# isLowercaseWordEmbeddings false
# Ontonotes config file
# Required fields
configFilename Ontonotes
pathToModelFile data/Models/Ontonotes
taggingEncodingScheme BILOU
tokenizationScheme DualTokenizationScheme
# Optional fields
beamSize 5
forceNewSentenceOnLineBreaks true
labelTypes TIME LAW GPE NORP LANGUAGE PERCENT FAC PRODUCT ORDINAL LOC PERSON WORK_OF_ART MONEY DATE EVENT QUANTITY ORG CARDINAL
logging false
# debuggingLogPath irrelevant
inferenceMethod GREEDY
normalizeTitleText false
pathToTokenNormalizationData brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt
predictionConfidenceThreshold -1
sortLexicallyFilesInFolders true
thresholdPrediction false
treatAllFilesInFolderAsOneBigDocument false
debug true
# Features
Forms 1
Capitalization 1
WordTypeInformation 1
Affixes 1
PreviousTag1 1
PreviousTag2 1
PreviousTagPatternLevel1 1
PreviousTagPatternLevel2 1
AggregateContext 0
AggregateGazetteerMatches 0
PrevTagsForContext 1
PredictionsLevel1 0
# Feature groups
BrownClusterPaths 1
isLowercaseBrownClusters false false false
pathsToBrownClusters brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt brown-clusters/brownBllipClusters brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt
minWordAppThresholdsForBrownClusters 5 5 5
GazetteersFeatures 1
pathToGazetteersLists ner-ext/KnownLists
WordEmbeddings 0
# pathsToWordEmbeddings WordEmbedding/model-2280000000.LEARNING_RATE=1e-08.EMBEDDING_LEARNING_RATE=1e-07.EMBEDDING_SIZE=50.gz
# embeddingDimensionalities 50
# minWordAppThresholdsForEmbeddings 0
# normalizationConstantsForEmbeddings 1.0
# normalizationMethodsForEmbeddings OVERALL
# isLowercaseWordEmbeddings false
# Use ResourceManager to read these properties
curatorHost = trollope.cs.illinois.edu
curatorPort = 9010
# If set to true, it will force Curator to re-annotate the input
curatorForceUpdate = false
nerConllConfig config/ner-conll-config.properties
nerOntonotesConfig config/ner-ontonotes-config.properties
lemmaConfig config/lemmatizer-config.properties
# If set to true, the output will be a Curator Record instead of a TextAnnotation.
useRecords = false
# If set to true, the input text will be assumed to be pre-tokenized
respectTokenization = false
# A comma-separated list of views to add (see ViewNames for a complete list of views).
viewsToAdd = POS,SHALLOW_PARSE,LEMMA,COREF,NER_CONLL,PARSE_STANFORD
# Use this option to output the annotated Record/TextAnnotation as plain text (instead of serialized)
outputToText = true
# Force the Curator client to overwrite the generated output files
forceUpdateOutputFile = true
## Properties used by the AnnotatorService to control caching behaviour
cacheDirectory = annotation-cache
throwExceptionIfNotCached = false
cacheHeapSizeInMegabytes = 100
cacheDiskSizeInMegabytes = 200
# sets system property to close cache when VM shuts down
setCacheShutdownHook = true
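As the first comment in this file says, these properties are read through a `ResourceManager` (illinois-core-utilities). A minimal sketch, assuming the 1.3.x package layout; the file name is illustrative:

```java
import edu.illinois.cs.cogcomp.core.utilities.ResourceManager;

public class ReadAnnotationConfig {
    public static void main(String[] args) throws Exception {
        // File name is illustrative; point it at the properties file shown above.
        ResourceManager rm = new ResourceManager("config/annotation-config.properties");
        String host = rm.getString("curatorHost");
        int port = rm.getInt("curatorPort");
        String[] views = rm.getString("viewsToAdd").split(",");
        System.out.println("Curator at " + host + ":" + port + ", adding " + views.length + " views");
    }
}
```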
#############################################
##
## Illinois SRL Configuration
@@ -41,13 +7,6 @@ setCacheShutdownHook = true
# Whether to use the Illinois Curator to get the required annotations for training/testing
# If set to false, Illinois NLP pipeline will be used
UseCurator = false
# The host and port of Curator. If UseCurator is false, make sure you have the pipeline config file set
CuratorHost = trollope.cs.illinois.edu
CuratorPort = 9010
# The file containing the configuration for the Illinois NLP pipeline
PipelineConfigFile = config/pipeline-config.properties
# The parser used to extract constituents and syntactic features
# Options are: Charniak, Berkeley, Stanford
@@ -56,6 +15,9 @@ DefaultParser = Stanford
WordNetConfig = jwnl_properties.xml
# The configuration for the Structured learner
LearnerConfig = config/learner.properties
### Training corpora directories ###
# This is the directory of the merged (mrg) WSJ files
PennTreebankHome = /shared/corpora/corporaWeb/treebanks/eng/pennTreebank/treebank-3/parsed/mrg/wsj/
@@ -64,13 +26,10 @@ NombankHome = /shared/corpora/corporaWeb/treebanks/eng/nombank/
# The directory of the sentence and pre-extracted features database (~5G of space required)
# Not used during test/working with pre-trained models
# TODO Change this when done
CacheDirectory = /shared/bronte/upadhya3/illinoisSRL/cache
CacheDirectory = cache
ModelsDirectory = /shared/bronte/upadhya3/illinoisSRL/models
ModelsDirectory = models
# Directory to output gold and predicted files for manual comparison
# Comment out for no output
OutputDirectory = srl-out
MaxInferenceRounds = 200
\ No newline at end of file
OutputDirectory = srl-out
\ No newline at end of file
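These top-level settings are wrapped by the `SRLProperties` singleton whose diff appears further down (this commit drops the Curator host/port getters and adds `getLearnerConfig`). A minimal sketch, assuming `SRLProperties` lives in `edu.illinois.cs.cogcomp.srl` alongside `Constants`; the config path is illustrative:

```java
import edu.illinois.cs.cogcomp.srl.SRLProperties;

public class ShowSrlConfig {
    public static void main(String[] args) throws Exception {
        // initialize() first tries the file system, then the classpath.
        SRLProperties.initialize("config/srl-config.properties");
        SRLProperties props = SRLProperties.getInstance();
        System.out.println("Learner config: " + props.getLearnerConfig());
        System.out.println("Default parser: " + props.getDefaultParser());
    }
}
```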
@@ -29,33 +29,31 @@
<artifactId>illinois-srl</artifactId>
<classifier>models-verb-stanford</classifier>
<version>5.1.4</version>
<scope>test</scope>
</dependency>
<!-- <dependency> -->
<!-- <groupId>edu.illinois.cs.cogcomp</groupId> -->
<!-- <artifactId>illinois-srl</artifactId> -->
<!-- <classifier>models-nom-stanford</classifier> -->
<!-- <version>5.1.4</version> -->
<!-- <scope>test</scope> -->
<!-- </dependency> -->
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-srl</artifactId>
<classifier>models-nom-stanford</classifier>
<version>5.1.4</version>
</dependency>
<!--The Illinois pipeline can be used instead -->
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-nlp-pipeline</artifactId>
<version>0.1.8-lemmatizer</version>
<version>0.1.9</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>1.2.18</version>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-caching-curator</artifactId>
<version>2.1.1</version>
<artifactId>illinois-curator</artifactId>
<version>3.1.1</version>
</dependency>
<dependency>
<groupId>com.gurobi</groupId>
@@ -106,6 +104,13 @@
<artifactId>snowball</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.7.7</version>
<optional>true</optional>
</dependency>
</dependencies>
@@ -148,12 +153,27 @@
</extensions>
</build>
<reporting>
<excludeDefaults>true</excludeDefaults>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
</plugin>
</plugins>
</reporting>
<distributionManagement>
<repository>
<id>CogcompSoftware</id>
<name>CogcompSoftware</name>
<url>scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo</url>
</repository>
<site>
<id>CogcompSoftwareDoc</id>
<url>scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/software/doc/${project.artifactId}</url>
</site>
</distributionManagement>
</project>
#!/bin/bash -e
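# Bundles the trained Verb and Nom models for each parser into classifier jars under target/.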
VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -v 'INFO'`
tmpdir=tmp-Srl-verb-$RANDOM
rm -rdf ${tmpdir}
mkdir -p ${tmpdir}/models
for parser in STANFORD CHARNIAK; do
cp ./models/Verb*${parser}* ${tmpdir}/models
cd ${tmpdir}
rm -rdf ../target/illinois-srl-models-verb-${parser}-${VERSION}.jar
jar cf ../target/illinois-srl-models-verb-${parser}-${VERSION}.jar models
cd ..
rm ${tmpdir}/models/*
done
rm -rdf ${tmpdir}
tmpdir=tmp-Srl-nom-$RANDOM
rm -rdf ${tmpdir}
mkdir -p ${tmpdir}/models
for parser in STANFORD CHARNIAK; do
cp ./models/Nom*${parser}* ${tmpdir}/models
cd ${tmpdir}
rm -rdf ../target/illinois-srl-models-nom-${parser}-${VERSION}.jar
jar cf ../target/illinois-srl-models-nom-${parser}-${VERSION}.jar models
cd ..
rm ${tmpdir}/models/*
done
rm -rdf ${tmpdir}
\ No newline at end of file
@@ -2,6 +2,51 @@
VERSION=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | grep -v 'INFO'`
tmpdir=tmp-Srl-verb-$RANDOM
rm -rdf ${tmpdir}
mkdir -p ${tmpdir}/models
for parser in STANFORD CHARNIAK; do
if [ ! -e "./models/Verb.Classifier.PARSE_${parser}.lex" ]; then
echo "$parser Verb models not found"
continue
fi
cp ./models/Verb*${parser}* ${tmpdir}/models
cd ${tmpdir}
rm -rdf ../target/illinois-srl-models-verb-${parser}-${VERSION}.jar
jar cf ../target/illinois-srl-models-verb-${parser}-${VERSION}.jar models
cd ..
rm ${tmpdir}/models/*
done
rm -rdf ${tmpdir}
tmpdir=tmp-Srl-nom-$RANDOM
rm -rdf ${tmpdir}
mkdir -p ${tmpdir}/models
for parser in STANFORD CHARNIAK; do
if [ ! -e "./models/Nom.Classifier.PARSE_${parser}.lex" ]; then
echo "$parser Nom models not found"
continue
fi
cp ./models/Nom*${parser}* ${tmpdir}/models
cd ${tmpdir}
rm -rdf ../target/illinois-srl-models-nom-${parser}-${VERSION}.jar
jar cf ../target/illinois-srl-models-nom-${parser}-${VERSION}.jar models
cd ..
rm ${tmpdir}/models/*
done
rm -rdf ${tmpdir}
echo "Compiled models to jars"
if [ -e "target/illinois-srl-models-nom-CHARNIAK-${VERSION}.jar" ]; then
echo "Deploying illinois-srl-models-nom-CHARNIAK-${VERSION}.jar"
mvn deploy:deploy-file \
-Dfile=target/illinois-srl-models-nom-CHARNIAK-${VERSION}.jar \
@@ -12,7 +57,9 @@ mvn deploy:deploy-file \
-Dpackaging=jar \
-Durl=scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo \
-DrepositoryId=CogcompSoftware
fi
if [ -e "target/illinois-srl-models-nom-STANFORD-${VERSION}.jar" ]; then
echo "Deploying illinois-srl-models-nom-STANFORD-${VERSION}.jar"
mvn deploy:deploy-file \
-Dfile=target/illinois-srl-models-nom-STANFORD-${VERSION}.jar \
@@ -23,7 +70,9 @@ mvn deploy:deploy-file \
-Dpackaging=jar \
-Durl=scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo \
-DrepositoryId=CogcompSoftware
fi
if [ -e "target/illinois-srl-models-verb-CHARNIAK-${VERSION}.jar" ]; then
echo "Deploying illinois-srl-models-verb-CHARNIAK-${VERSION}.jar"
mvn deploy:deploy-file \
-Dfile=target/illinois-srl-models-verb-CHARNIAK-${VERSION}.jar \
@@ -34,7 +83,9 @@ mvn deploy:deploy-file \
-Dpackaging=jar \
-Durl=scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo \
-DrepositoryId=CogcompSoftware
fi
if [ -e "target/illinois-srl-models-verb-STANFORD-${VERSION}.jar" ]; then
echo "Deploying illinois-srl-models-verb-STANFORD-${VERSION}.jar"
mvn deploy:deploy-file \
-Dfile=target/illinois-srl-models-verb-STANFORD-${VERSION}.jar \
@@ -44,4 +95,5 @@ mvn deploy:deploy-file \
-Dclassifier=models-verb-stanford \
-Dpackaging=jar \
-Durl=scp://bilbo.cs.illinois.edu:/mounts/bilbo/disks/0/www/cogcomp/html/m2repo \
-DrepositoryId=CogcompSoftware
\ No newline at end of file
-DrepositoryId=CogcompSoftware
fi
\ No newline at end of file
#!/bin/bash
# mvn clean compile
# mvn -q dependency:copy-dependencies
mvn clean compile
mvn -q dependency:copy-dependencies
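# Classpath: compiled classes, config files, and the copied dependency jars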
CP=target/classes:config:target/dependency/*
......
@@ -70,46 +70,36 @@ public class ColumnFormatWriter {
printFormatted(tr, out, pav.getTextAnnotation());
}
private void printFormatted(String[][] columns, PrintWriter out,
        TextAnnotation ta) {
    // System.out.println(columns.length);
    // System.out.println(columns[0].length);
private void printFormatted(String[][] columns, PrintWriter out, TextAnnotation ta) {
    // leftOfStar: length of everything before the asterisk.
    // rightOfStar: length of asterisk and what comes after.
    int[] leftOfStar = new int[columns[0].length];
    int[] rightOfStart = new int[columns[0].length];
    for (int row = 0; row < columns.length; row++) {
        // System.out.println(row);
        // System.out.println(Arrays.asList(columns[row]));
        String[] rowData = columns[row];
    for (String[] rowData : columns) {
        for (int col = 0; col < rowData.length; col++) {
            String word = rowData[col];
            int starPos = word.indexOf("*");
            int lenLeft, lenRight;
            if (starPos < 0) {
                lenLeft = word.length();
                lenRight = -1;
            } else {
                lenLeft = starPos + 1;
                lenRight = word.length() - starPos + 1;
            }
            if (leftOfStar[col] < lenLeft)
                leftOfStar[col] = lenLeft;
            if (rightOfStart[col] < lenRight)
                rightOfStart[col] = lenRight;
        }
    }
// System.out.println("here");
@@ -124,13 +114,6 @@ public class ColumnFormatWriter {
out.print(rowData[0]);
// if (rowData.length == 1)
// {
// // System.out.println(rowData[0] + "\t" + row);
// out.println();
// continue;
// }
// print the spaces
for (int spCount = rowData[0].length(); spCount < leftOfStar[0]; spCount++)
out.print(" ");
@@ -181,10 +164,6 @@
}
}
/**
* @param columns
* @return
*/
private String[][] transpose(List<String[]> columns, int size) {
String[][] output = new String[size][];
@@ -204,12 +183,8 @@
/**
* Return a table. Number of rows = number of words; the number of columns
* depends on how many predicate-argument relations we have
*
* @param ta
* @return
*/
private String[][] transformToColumns(TextAnnotation ta) {
List<String[]> columns = new ArrayList<>();
// first the words
@@ -245,10 +220,6 @@
return transpose(columns, ta.size());
}
/**
* @param ta
* @return
*/
private static String[] getNEData(TextAnnotation ta) {
if (!ta.hasView(ViewNames.NER_CONLL)) {
@@ -260,7 +231,7 @@
return chunk;
}
SpanLabelView nerView = (SpanLabelView) ta.getView(ViewNames.NER);
SpanLabelView nerView = (SpanLabelView) ta.getView(ViewNames.NER_CONLL);
List<Constituent> nerConstituents = nerView.getConstituents();
@@ -304,13 +275,11 @@
return chunk;
}
SpanLabelView chunkView = (SpanLabelView) ta
.getView(ViewNames.SHALLOW_PARSE);
SpanLabelView chunkView = (SpanLabelView) ta.getView(ViewNames.SHALLOW_PARSE);
List<Constituent> chunkConstituents = chunkView.getConstituents();
Collections.sort(chunkConstituents,
TextAnnotationUtilities.constituentStartComparator);
Collections.sort(chunkConstituents, TextAnnotationUtilities.constituentStartComparator);
Map<Integer, String> cc = new HashMap<>();
for (Constituent c : chunkConstituents) {
@@ -337,16 +306,11 @@
return chunk;
}
/**
* @param columns
* @param ta
*/
private void addPredicateArgs(List<String[]> columns, TextAnnotation ta) {
PredicateArgumentView predArgView = null;
if (ta.hasView(predicateArgumentViewName))
predArgView = (PredicateArgumentView) ta
.getView(predicateArgumentViewName);
predArgView = (PredicateArgumentView) ta.getView(predicateArgumentViewName);
convertPredicateArgView(ta, predArgView, columns, true);
@@ -359,15 +323,15 @@
if (pav != null)
predicates = pav.getPredicates();
Collections.sort(predicates,
TextAnnotationUtilities.constituentStartComparator);
Collections.sort(predicates, TextAnnotationUtilities.constituentStartComparator);
int size = ta.size();
addPredicateInfo(columns, predicates, size, addSense);
for (Constituent predicate : predicates) {
assert pav != null;
List<Relation> args = pav.getArguments(predicate);
String[] paInfo = addPredicateArgInfo(predicate, args, size);
@@ -375,21 +339,14 @@
}
}
/**
* @param columns
* @param predicates
* @param size
*/
private void addPredicateInfo(List<String[]> columns,
List<Constituent> predicates, int size, boolean addSense) {
Map<Integer, String> senseMap = new HashMap<>();
Map<Integer, String> lemmaMap = new HashMap<>();
for (Constituent c : predicates) {
senseMap.put(c.getStartSpan(),
c.getAttribute(CoNLLColumnFormatReader.SenseIdentifer));
lemmaMap.put(c.getStartSpan(),
c.getAttribute(CoNLLColumnFormatReader.LemmaIdentifier));
senseMap.put(c.getStartSpan(), c.getAttribute(CoNLLColumnFormatReader.SenseIdentifer));
lemmaMap.put(c.getStartSpan(), c.getAttribute(CoNLLColumnFormatReader.LemmaIdentifier));
}
String[] sense = new String[size];
@@ -406,19 +363,11 @@
}
}
// System.out.println(Arrays.asList(sense));
// System.out.println(Arrays.asList(lemma));
if (addSense)
columns.add(sense);
columns.add(lemma);
}
/**
* @param predicate
* @param args
* @param size
* @return
*/
private String[] addPredicateArgInfo(Constituent predicate,
List<Relation> args, int size) {
Map<Integer, String> paInfo = new HashMap<>();
@@ -430,8 +379,7 @@
argPredicate = argPredicate.replaceAll("ARG", "A");
argPredicate = argPredicate.replaceAll("Support", "SUP");
for (int i = r.getTarget().getStartSpan(); i < r.getTarget()
.getEndSpan(); i++) {
for (int i = r.getTarget().getStartSpan(); i < r.getTarget().getEndSpan(); i++) {
paInfo.put(i, "*");
if (i == r.getTarget().getStartSpan())
paInfo.put(i, "(" + argPredicate + paInfo.get(i));
@@ -453,27 +401,18 @@
return paColumn;
}
/**
* @param ta
* @return
*/
private String[] getParse(TextAnnotation ta) {
String[] parse = new String[ta.size()];
for (int sentenceId = 0; sentenceId < ta.getNumberOfSentences(); sentenceId++) {
Tree<String> tree = ParseHelper.getParseTree(parseViewName, ta,
sentenceId);
Tree<String> tree = ParseHelper.getParseTree(parseViewName, ta, sentenceId);
Sentence sentence = ta.getSentence(sentenceId);
tree = ParseUtils.snipNullNodes(tree);
tree = ParseUtils.stripFunctionTags(tree);
// tree = tree.getChild(0);
// System.out.println("tree is :" + tree);
String[] treeLines = tree.toString().split("\n");
if (treeLines.length != sentence.size()) {
......
@@ -5,7 +5,7 @@ package edu.illinois.cs.cogcomp.srl;
* TODO Change this before shipping
*/
public class Constants {
public final static String systemVersion = "5.1";
public final static String systemVersion = "5.4.1";
public final static String systemName = "illinoisSRL";
......
@@ -97,24 +97,30 @@
@CommandDescription(description = "Performs the full training & testing sequence for all SRL types",
usage = "expt [Verb | Nom] cacheDatasets=[true | false]")
public static void expt(String srlType, String cacheDatasets, String learnerConfig) throws Exception {
public static void expt(String srlType, String cacheDatasets) throws Exception {
// Step 1: Cache all the datasets we're going to use
if (Boolean.parseBoolean(cacheDatasets)) cacheDatasets();
// Step 2: Iterate between pre-extracting all the features needed for training and training
// We don't need to train a predicate classifier for Verb
if (SRLType.valueOf(srlType) == SRLType.Nom) {
preExtract(srlType, "Predicate");
train(srlType, "Predicate");
}
preExtract(srlType, "Sense");
train(srlType, "Sense",learnerConfig);
train(srlType, "Sense");
preExtract(srlType, "Identifier");
train(srlType, "Identifier",learnerConfig);
train(srlType, "Identifier");
tuneIdentifier(srlType);
preExtract(srlType, "Classifier");
train(srlType, "Classifier",learnerConfig);
train(srlType, "Classifier");
// Step 3: Evaluate
evaluate(srlType);
System.out.println("All Done!");
}
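// Note: the learnerConfig command-line argument is gone; train() now reads the
// learner configuration from SRLProperties.getLearnerConfig() (see further down).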
@CommandDescription(description = "Reads and caches all the datasets", usage = "cacheDatasets")
@@ -185,7 +191,7 @@
Counter<String> addedViews = new Counter<>();
log.info("Initializing pre-processor");
TextPreProcessor.initialize(configFile, true);
TextPreProcessor.initialize(configFile);
int count = 0;
while (dataset.hasNext()) {
@@ -229,10 +235,20 @@
public static SRLManager getManager(SRLType srlType, boolean trainingMode) throws Exception {
String viewName;
if (defaultParser == null) defaultParser = SRLProperties.getInstance().getDefaultParser();
if (defaultParser.equals("Charniak")) viewName = ViewNames.PARSE_CHARNIAK;
else if (defaultParser.equals("Berkeley")) viewName = ViewNames.PARSE_BERKELEY;
else if (defaultParser.equals("Stanford")) viewName = ViewNames.PARSE_STANFORD;
else viewName = defaultParser;
switch (defaultParser) {
case "Charniak":
viewName = ViewNames.PARSE_CHARNIAK;
break;
case "Berkeley":
viewName = ViewNames.PARSE_BERKELEY;
break;
case "Stanford":
viewName = ViewNames.PARSE_STANFORD;
break;
default:
viewName = defaultParser;
break;
}
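// Note: the default branch above passes a custom parse view name through unchanged.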
if (srlType == SRLType.Verb)
return new VerbSRLManager(trainingMode, viewName);
@@ -275,7 +291,6 @@
String allDataCacheFile = properties.getFeatureCacheFile(srlType,
modelToExtract, featureSet, defaultParser, dataset);
System.out.println("reading feature cache from " + allDataCacheFile);
FeatureVectorCacheFile featureCache = preExtract(numConsumers, manager,
modelToExtract, dataset, allDataCacheFile, false);
@@ -307,9 +322,6 @@
if (IOUtils.exists(cacheFile2)) {
log.warn("Old pruned cache file found. Not doing anything...");
return;
// log.warn("Old pruned cache file found. Deleting...");
// IOUtils.rm(cacheFile2);
// log.info("Done");
}
log.info("Pruning features. Saving pruned features to {}", cacheFile2);
@@ -358,7 +370,7 @@
@CommandDescription(description = "Trains a specific model and SRL type",
usage = "train [Verb | Nom] [Predicate | Sense | Identifier | Classifier]")
public static void train(String srlType_, String model_, String learnerConfig) throws Exception {
public static void train(String srlType_, String model_) throws Exception {
SRLType srlType = SRLType.valueOf(srlType_);
SRLManager manager = getManager(srlType, true);
@@ -392,7 +404,7 @@
log.info("Setting up solver, learning may take time if you have too many instances in SLProblem ....");
SLParameters params = new SLParameters();
params.loadConfigFile(learnerConfig);
params.loadConfigFile(properties.getLearnerConfig());
params.C_FOR_STRUCTURE = (float) c;
SRLMulticlassInference infSolver = new SRLMulticlassInference(manager, model);
@@ -504,12 +516,10 @@
count++;
if (count % 1000 == 0) {
long end = System.currentTimeMillis();
log.info(count + " sentences done. Took "
+ (end - start) + "ms, F1 so far = "
+ tester.getAverageF1());
log.info(count + " sentences done. Took " + (end - start) + "ms, " +
"F1 so far = " + tester.getAverageF1());
}
}
System.exit(-1);
long end = System.currentTimeMillis();
System.out.println(count + " sentences done. Took " + (end - start) + "ms");
......
@@ -19,25 +19,18 @@
private static final Logger log = LoggerFactory.getLogger(SRLProperties.class);
private static SRLProperties theInstance;
private PropertiesConfiguration config;
private final String curatorHost;
private final int curatorPort, maxInferenceRounds;
private final String wordNetFile;
private SRLProperties(URL url) throws ConfigurationException {
config = new PropertiesConfiguration(url);
curatorHost = config.getString("CuratorHost", "");
curatorPort = config.getInt("CuratorPort", -1);
this.wordNetFile = config.getString("WordNetConfig");
if (config.containsKey("LoadWordNetConfigFromClassPath")
&& config.getBoolean("LoadWordNetConfigFromClassPath")) {
WordNetManager.loadConfigAsClasspathResource(true);
}
maxInferenceRounds = config.getInt("MaxInferenceRounds");
}
public static void initialize(String configFile) throws Exception {
// first try to load the file from the file system
@@ -156,19 +149,7 @@
return Constants.systemVersion;
}
public String getPipelineConfigFile() {
return config.getString("PipelineConfigFile");
}
public String getCuratorHost() {
return curatorHost;
}
public int getCuratorPort() {
return curatorPort;
}
public int getMaxInferenceRounds() {
return maxInferenceRounds;
}
public String getLearnerConfig() {
return this.config.getString("LearnerConfig");
}
}
@@ -95,7 +95,7 @@
properties = SRLProperties.getInstance();
log.info("Initializing pre-processor");
TextPreProcessor.initialize(configFile, false);
TextPreProcessor.initialize(configFile);
log.info("Creating {} manager", srlType);
manager = Main.getManager(SRLType.valueOf(srlType), false);
......
@@ -3,7 +3,6 @@ package edu.illinois.cs.cogcomp.srl.caches;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.sl.core.SLProblem;
import edu.illinois.cs.cogcomp.sl.util.IFeatureVector;
import edu.illinois.cs.cogcomp.sl.util.SparseFeatureVector;
import edu.illinois.cs.cogcomp.srl.core.Models;
import edu.illinois.cs.cogcomp.srl.core.SRLManager;
import edu.illinois.cs.cogcomp.srl.jlis.SRLMulticlassInstance;
@@ -86,7 +85,7 @@ public class FeatureVectorCacheFile implements Closeable,
String features = parts[2];
SRLMulticlassInstance x = new SRLMulticlassInstance(model, lemma, features);
SRLMulticlassLabel y = new SRLMulticlassLabel(x, label, model, manager);
SRLMulticlassLabel y = new SRLMulticlassLabel(label, model, manager);
return new Pair<>(x, y);
} catch (Exception e) {
......
@@ -33,13 +33,11 @@ public abstract class AbstractPredicateDetector {
List<Constituent> list = new ArrayList<>();
for (int i = 0; i < ta.size(); i++) {
Option<String> opt = getLemma(ta, i);
if (opt.isPresent()) {
Constituent c = new Constituent("", "", ta, i, i + 1);
c.addAttribute(CoNLLColumnFormatReader.LemmaIdentifier,
opt.get());
c.addAttribute(CoNLLColumnFormatReader.LemmaIdentifier, opt.get());
list.add(c);
}
}
......