From 39993e18b5afcb80f570d7fc71fe116b074d18fb Mon Sep 17 00:00:00 2001 From: Christos Christodoulopoulos <christod@illinois.edu> Date: Wed, 30 Sep 2015 11:47:35 -0500 Subject: [PATCH] New config for pipeline --- config/lemmatizer-config.properties | 1 + config/ner-conll-config.properties | 2 +- config/ner-ontonotes-config.properties | 53 ++++++++++++++++++++++++++ config/pipeline-config.properties | 10 ----- config/srl-config.properties | 8 +++- run.sh | 11 ------ scripts/run-interactive.sh | 2 +- scripts/run.sh | 2 +- 8 files changed, 63 insertions(+), 26 deletions(-) create mode 100644 config/lemmatizer-config.properties create mode 100644 config/ner-ontonotes-config.properties delete mode 100644 config/pipeline-config.properties delete mode 100755 run.sh diff --git a/config/lemmatizer-config.properties b/config/lemmatizer-config.properties new file mode 100644 index 0000000..a8db3f6 --- /dev/null +++ b/config/lemmatizer-config.properties @@ -0,0 +1 @@ +useStanfordConvention true diff --git a/config/ner-conll-config.properties b/config/ner-conll-config.properties index 979a364..e77c4ce 100644 --- a/config/ner-conll-config.properties +++ b/config/ner-conll-config.properties @@ -33,7 +33,7 @@ PreviousTagPatternLevel2 1 AggregateContext 0 AggregateGazetteerMatches 0 PrevTagsForContext 1 -PredictionsLevel1 1 +PredictionsLevel1 0 # Feature groups BrownClusterPaths 1 diff --git a/config/ner-ontonotes-config.properties b/config/ner-ontonotes-config.properties new file mode 100644 index 0000000..dbb5ad1 --- /dev/null +++ b/config/ner-ontonotes-config.properties @@ -0,0 +1,53 @@ +# Ontonotes config file + +# Required fields +configFilename Ontonotes +pathToModelFile data/Models/Ontonotes +taggingEncodingScheme BILOU +tokenizationScheme DualTokenizationScheme + +# Optional fields +beamSize 5 +forceNewSentenceOnLineBreaks true +labelTypes TIME LAW GPE NORP LANGUAGE PERCENT FAC PRODUCT ORDINAL LOC PERSON WORK_OF_ART MONEY DATE EVENT QUANTITY ORG CARDINAL +logging false +# debuggingLogPath irrelevant +inferenceMethod GREEDY +normalizeTitleText false +pathToTokenNormalizationData brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt +predictionConfidenceThreshold -1 +sortLexicallyFilesInFolders true +thresholdPrediction false +treatAllFilesInFolderAsOneBigDocument false +debug true + +# Features +Forms 1 +Capitalization 1 +WordTypeInformation 1 +Affixes 1 +PreviousTag1 1 +PreviousTag2 1 +PreviousTagPatternLevel1 1 +PreviousTagPatternLevel2 1 +AggregateContext 0 +AggregateGazetteerMatches 0 +PrevTagsForContext 1 +PredictionsLevel1 0 + +# Feature groups +BrownClusterPaths 1 +isLowercaseBrownClusters false false false +pathsToBrownClusters brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt brown-clusters/brownBllipClusters brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt +minWordAppThresholdsForBrownClusters 5 5 5 + +GazetteersFeatures 1 +pathToGazetteersLists ner-ext/KnownLists + +WordEmbeddings 0 +# pathsToWordEmbeddings WordEmbedding/model-2280000000.LEARNING_RATE=1e-08.EMBEDDING_LEARNING_RATE=1e-07.EMBEDDING_SIZE=50.gz +# embeddingDimensionalities 50 +# minWordAppThresholdsForEmbeddings 0 +# normalizationConstantsForEmbeddings 1.0 +# normalizationMethodsForEmbeddings OVERALL +# isLowercaseWordEmbeddings false diff --git a/config/pipeline-config.properties b/config/pipeline-config.properties deleted file mode 100644 index 9f110cf..0000000 --- a/config/pipeline-config.properties +++ /dev/null @@ -1,10 +0,0 @@ -usePos true -useChunker true -useLemmatizer true -useNer true -useStanfordParse true -lemmaCacheFile data/lemmaCache.txt -updateLemmaCacheFile false -maxLemmaCacheEntries 10000 -wordnetPath data/WordNet -nerConfigFile config/ner-conll-config.properties diff --git a/config/srl-config.properties b/config/srl-config.properties index 4923336..ac94f90 100644 --- a/config/srl-config.properties +++ b/config/srl-config.properties @@ -5,6 +5,10 @@ curatorPort = 9010 # If set to true, it will force Curator to re-annotate the input curatorForceUpdate = false +nerConllConfig config/ner-conll-config.properties +nerOntonotesConfig config/ner-ontonotes-config.properties +lemmaConfig config/lemmatizer-config.properties + # If set to true, the output will be a Curator Record instead of a TextAnnotation. useRecords = false @@ -36,7 +40,7 @@ setCacheShutdownHook = true # Whether to use the Illinois Curator to get the required annotations for training/testing # If set to false, Illinois NLP pipeline will be used -UseCurator = true +UseCurator = false # The URL and host of Curator. If UseCurator is false, make sure you have pipeline config file set CuratorHost = trollope.cs.illinois.edu CuratorPort = 9010 @@ -48,7 +52,7 @@ PipelineConfigFile = config/pipeline-config.properties # The parser used to extract constituents and syntactic features # Options are: Charniak, Berkeley, Stanford # NB: Only Stanford can be used in standalone mode. -DefaultParser = Charniak +DefaultParser = Stanford WordNetConfig = jwnl_properties.xml diff --git a/run.sh b/run.sh deleted file mode 100755 index 290b4fb..0000000 --- a/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#mvn -q dependency:copy-dependencies -#mvn -q compile -# module load sun-jdk/1.8.0 -CP="./config/:./target/classes/:./target/dependency/*" - -OPTIONS="-Xss40m -ea -cp $CP" -PACKAGE_PREFIX="edu.illinois.cs.cogcomp" - -MAIN="$PACKAGE_PREFIX.srl.Main" - -time nice java $OPTIONS $MAIN $CONFIG_STR $* diff --git a/scripts/run-interactive.sh b/scripts/run-interactive.sh index 543c241..33c7a6f 100755 --- a/scripts/run-interactive.sh +++ b/scripts/run-interactive.sh @@ -1,5 +1,5 @@ #!/bin/bash -mvn compile +mvn clean compile mvn -q dependency:copy-dependencies CP=target/classes:config:target/dependency/* diff --git a/scripts/run.sh b/scripts/run.sh index bf1b4d9..500342b 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -mvn compile +mvn clean compile mvn -q dependency:copy-dependencies CP=target/classes:config:target/dependency/* -- GitLab