/// <summary>
/// Creates a clique potential function that keeps references to the supplied
/// weight matrices and flags. Nothing is copied or validated here.
/// </summary>
/// <param name="linearWeights">Stored in the <c>linearWeights</c> field.</param>
/// <param name="inputLayerWeights">Stored in the <c>inputLayerWeights</c> field.</param>
/// <param name="outputLayerWeights">Stored in the <c>outputLayerWeights</c> field.</param>
/// <param name="flags">Stored in the <c>flags</c> field.</param>
public NonLinearCliquePotentialFunction(double[][] linearWeights, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags)
{
    this.flags = flags;
    this.outputLayerWeights = outputLayerWeights;
    this.inputLayerWeights = inputLayerWeights;
    this.linearWeights = linearWeights;
}
/// <summary>
/// Reads the tokenFactory and tokensAnnotationClassName from
/// <see cref="SeqClassifierFlags"/>
/// and forwards them to the three-argument <c>Init</c> overload.
/// </summary>
/// <param name="flags">
/// Source of <c>tokensAnnotationClassName</c> and <c>tokenFactory</c>; either may be null,
/// in which case the defaults below are used.
/// </param>
/// <exception cref="System.Exception">
/// The type named by <c>flags.tokenFactory</c> could not be resolved, instantiated or cast.
/// The original failure is available as the inner exception.
/// </exception>
public virtual void Init(SeqClassifierFlags flags)
{
    // Fall back to the default tokens annotation when none is configured.
    this.tokensAnnotationClassName = flags.tokensAnnotationClassName != null
        ? flags.tokensAnnotationClassName
        : "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";

    if (flags.tokenFactory != null)
    {
        try
        {
            this.tokenFactory = (ICoreTokenFactory <IN>)System.Activator.CreateInstance(Sharpen.Runtime.GetType(flags.tokenFactory));
        }
        catch (Exception e)
        {
            // System.Exception has no (Exception) constructor; keep the cause as the
            // inner exception and say which factory failed.
            throw new Exception("Unable to create token factory '" + flags.tokenFactory + "': " + e.Message, e);
        }
    }
    else
    {
        this.tokenFactory = (ICoreTokenFactory <IN>) new CoreLabelTokenFactory();
    }

    Init(flags, this.tokenFactory, this.tokensAnnotationClassName);
}
/// <summary>
/// Wraps an existing object bank. The base class is constructed with two null
/// arguments; this instance only records the values it is given.
/// </summary>
/// <param name="flags">Stored in the <c>flags</c> field.</param>
/// <param name="wrapped">The object bank being wrapped; stored in <c>wrapped</c>.</param>
/// <param name="knownLCWords">Stored by reference in <c>knownLCWords</c>.</param>
public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank<IList<In>> wrapped, ICollection<string> knownLCWords)
    : base(null, null)
{
    this.wrapped = wrapped;
    this.knownLCWords = knownLCWords;
    this.flags = flags;
}
/// <summary>
/// Builds a tokenizer factory from <paramref name="flags"/> and forwards it to the
/// two-argument <c>Init</c> overload.
/// </summary>
/// <remarks>
/// The option string always starts with <c>tokenizeNLs=false,invertible=true</c>;
/// <c>flags.tokenizerOptions</c> is appended after a comma when present. When
/// <c>flags.tokenizerFactory</c> names a type, its static factory method taking a single
/// string is invoked reflectively; otherwise the PTB tokenizer factory is used.
/// </remarks>
/// <param name="flags">Supplies <c>tokenizerOptions</c> and <c>tokenizerFactory</c>.</param>
/// <exception cref="System.Exception">
/// The configured factory type or its factory method could not be resolved or invoked.
/// The original failure is available as the inner exception.
/// </exception>
public virtual void Init(SeqClassifierFlags flags)
{
    string options = "tokenizeNLs=false,invertible=true";
    if (flags.tokenizerOptions != null)
    {
        options = options + ',' + flags.tokenizerOptions;
    }

    ITokenizerFactory <In> factory;
    if (flags.tokenizerFactory != null)
    {
        try
        {
            Type clazz = ErasureUtils.UncheckedCast(Sharpen.Runtime.GetType(flags.tokenizerFactory));

            // Type.GetMethod needs the parameter types as an array.
            Type[] signature = new Type[] { typeof(string) };

            // The ported code names this method in PascalCase (see the PTB call below);
            // the Java-style camelCase name is kept as a fallback.
            MethodInfo factoryMethod = clazz.GetMethod("NewCoreLabelTokenizerFactory", signature);
            if (factoryMethod == null)
            {
                factoryMethod = clazz.GetMethod("newCoreLabelTokenizerFactory", signature);
            }
            if (factoryMethod == null)
            {
                throw new System.MissingMethodException(clazz.FullName, "NewCoreLabelTokenizerFactory");
            }

            // MethodInfo.Invoke needs the arguments as an object array; null target = static call.
            factory = ErasureUtils.UncheckedCast(factoryMethod.Invoke(null, new object[] { options }));
        }
        catch (Exception e)
        {
            // System.Exception has no (Exception) constructor; keep the cause as inner exception.
            throw new Exception("Unable to create tokenizer factory '" + flags.tokenizerFactory + "': " + e.Message, e);
        }
    }
    else
    {
        factory = ErasureUtils.UncheckedCast(PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory(options));
    }

    Init(flags, factory);
}
/// <summary>
/// Looks <paramref name="c2"/> up through <c>cd.GetW</c> and maps the answer to a
/// feature string.
/// </summary>
/// <param name="c2">The string passed to <c>cd.GetW</c>.</param>
/// <param name="flags">Not read by this method.</param>
/// <returns><c>"1"</c> when <c>cd.GetW(c2)</c> equals <c>"1"</c>, otherwise <c>"0"</c>.</returns>
public virtual string CheckDic(string c2, SeqClassifierFlags flags)
{
    bool found = cd.GetW(c2).Equals("1");
    return found ? "1" : "0";
}
/// <summary>
/// Creates the prior. The first four arguments go to the base constructor; the
/// matrix pair is split into the entity and sub-entity matrices, and the tag
/// indices of <c>"ORG"</c> and <c>"LOC"</c> are looked up once.
/// </summary>
/// <param name="backgroundSymbol">Forwarded to the base constructor.</param>
/// <param name="classIndex">Forwarded to the base constructor.</param>
/// <param name="tagIndex">Forwarded to the base constructor; also used for the ORG/LOC lookups.</param>
/// <param name="doc">Forwarded to the base constructor.</param>
/// <param name="matrices">First element becomes <c>entityMatrix</c>, second <c>subEntityMatrix</c>.</param>
/// <param name="flags">Stored in the <c>flags</c> field.</param>
public EmpiricalNERPriorBIO(string backgroundSymbol, IIndex<string> classIndex, IIndex<string> tagIndex, IList<IN> doc, Pair<double[][], double[][]> matrices, SeqClassifierFlags flags)
    : base(backgroundSymbol, classIndex, tagIndex, doc)
{
    this.flags = flags;

    entityMatrix = matrices.First();
    subEntityMatrix = matrices.Second();

    ORGIndex = tagIndex.IndexOf("ORG");
    LOCIndex = tagIndex.IndexOf("LOC");
}
/// <summary>
/// Builds the detector by choosing a character-list file and an affix-dictionary
/// file under the corpora directory and loading both.
/// </summary>
/// <remarks>
/// The directory is <c>flags.sighanCorporaDict</c> when set, else <c>DefaultCorporaDict</c>;
/// a trailing <c>/</c> is added when the directory is non-empty and lacks one.
/// When none of <c>useChPos</c>, <c>useCTBChar2</c>, <c>usePKChar2</c> is set the
/// closed-track CTB list is used. Otherwise only CTB and PK are supported.
/// </remarks>
/// <param name="flags">Supplies the corpora directory and the corpus selection switches.</param>
/// <exception cref="System.Exception">
/// POS lists were requested together with an ASBC/HK/MSR setting, or with neither
/// <c>useCTBChar2</c> nor <c>usePKChar2</c> set.
/// </exception>
public TagAffixDetector(SeqClassifierFlags flags)
{
    string corporaDict = flags.sighanCorporaDict != null ? flags.sighanCorporaDict : DefaultCorporaDict;
    if (!corporaDict.IsEmpty() && !corporaDict.EndsWith("/"))
    {
        corporaDict = corporaDict + '/';
    }

    bool wantsPosLists = flags.useChPos || flags.useCTBChar2 || flags.usePKChar2;

    string ccPath;
    string adPath;
    if (!wantsPosLists)
    {
        ccPath = corporaDict + "dict/pos_close/char.ctb.list";
        adPath = corporaDict + "dict/in.ctb";
    }
    else if (flags.useASBCChar2 || flags.useHKChar2 || flags.useMSRChar2)
    {
        // Only CTB and PK lists are available for the POS-based setting.
        throw new Exception("only support settings for CTB and PK now.");
    }
    else if (flags.useCTBChar2)
    {
        ccPath = corporaDict + "dict/character_list";
        adPath = corporaDict + "dict/in.ctb";
    }
    else if (flags.usePKChar2)
    {
        ccPath = corporaDict + "dict/pos_open/character_list.pku.utf8";
        adPath = corporaDict + "dict/in.pk";
    }
    else
    {
        throw new Exception("none of flags.useXXXChar2 are on");
    }

    cc = new CorpusChar(ccPath);
    aD = new AffixDictionary(adPath);
}
/// <summary>
/// Builds the flags used by the tests. The values are somewhat arbitrary: they are
/// only picked so that the problematic code path gets executed.
/// </summary>
/// <returns>
/// A new <see cref="SeqClassifierFlags"/> with SIGHAN post-processing and the PK
/// setting enabled and both whitespace-keeping options disabled.
/// </returns>
private static SeqClassifierFlags CreateTestFlags()
{
    SeqClassifierFlags testFlags = new SeqClassifierFlags
    {
        sighanPostProcessing = true,
        usePk = true,
        keepEnglishWhitespaces = false,
        keepAllWhitespaces = false
    };
    return testFlags;
}
/// <summary>
/// Trains a CRF classifier configured from a properties file and serializes it.
/// </summary>
/// <param name="propPath">Path handed to <c>StringUtils.propFileToProperties</c>.</param>
/// <param name="modelPath">Path handed to <c>serializeClassifier</c>.</param>
static void TrainAndWrite(string propPath, string modelPath)
{
    Properties settings = StringUtils.propFileToProperties(propPath);
    var classifier = new CRFClassifier(new SeqClassifierFlags(settings));

    classifier.train();
    classifier.serializeClassifier(modelPath);
}
/// <summary>
/// Creates a second-order clique potential function that keeps references to the
/// supplied edge and node weight matrices and flags.
/// </summary>
/// <remarks>
/// Per the comments carried over with this constructor: for the input-layer matrices
/// the first index is the hidden unit in layer one and the second index is the input
/// feature; for the output-layer matrices the first index is the output class and the
/// second index is the hidden unit.
/// </remarks>
/// <param name="inputLayerWeights4Edge">Stored in <c>inputLayerWeights4Edge</c>.</param>
/// <param name="outputLayerWeights4Edge">Stored in <c>outputLayerWeights4Edge</c>.</param>
/// <param name="inputLayerWeights">Stored in <c>inputLayerWeights</c>.</param>
/// <param name="outputLayerWeights">Stored in <c>outputLayerWeights</c>.</param>
/// <param name="flags">Stored in <c>flags</c>.</param>
public NonLinearSecondOrderCliquePotentialFunction(double[][] inputLayerWeights4Edge, double[][] outputLayerWeights4Edge, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags)
{
    this.flags = flags;

    // Node weights.
    this.inputLayerWeights = inputLayerWeights;
    this.outputLayerWeights = outputLayerWeights;

    // Edge weights.
    this.inputLayerWeights4Edge = inputLayerWeights4Edge;
    this.outputLayerWeights4Edge = outputLayerWeights4Edge;
}
/// <summary>Make an Arabic Segmenter.</summary>
/// <remarks>
/// Segmenter-specific options are read from <paramref name="props"/> and then removed
/// from it, so the caller's object is modified. What remains is used to build the
/// <see cref="SeqClassifierFlags"/> for the underlying CRF classifier.
/// </remarks>
/// <param name="props">
/// Options for how to tokenize. See the main method of
/// <see cref="ArabicTokenizer{T}"/>
/// for details
/// </param>
/// <exception cref="System.Exception">
/// Both the local-features-only option and a custom feature factory were supplied.
/// </exception>
public ArabicSegmenter(Properties props)
{
    // Segmenter options that must be known before the tokenizer factory is created.
    noRewrites = props.Contains(optNoRewrites);
    domain = props.GetProperty(optDomain, "atb");
    hasDomainLabels = props.Contains(optWithDomains);
    tedEvalPrefix = props.GetProperty(optTedEval, null);
    tokenizerOptions = props.GetProperty(optTokenizer, null);
    isTokenized = props.Contains(optTokenized);

    tf = GetTokenizerFactory();

    // Markers are assigned after the tokenizer factory, as in the original ordering.
    prefixMarker = props.GetProperty(optPrefix, string.Empty);
    suffixMarker = props.GetProperty(optSuffix, string.Empty);

    // Feature factory: the local-only flag picks its own factory and cannot be
    // combined with a custom one; otherwise a default is filled in when none is given.
    if (props.Contains(optLocalFeaturesOnly))
    {
        if (props.Contains(optFeatureFactory))
        {
            throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--"
                + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of "
                + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag.");
        }
        props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
    }
    if (!props.Contains(optFeatureFactory))
    {
        props.SetProperty(optFeatureFactory, defaultFeatureFactory);
    }

    // Remove all command-line properties that are specific to ArabicSegmenter
    props.Remove(optTokenizer);
    props.Remove(optTokenized);
    props.Remove(optPrefix);
    props.Remove(optSuffix);
    props.Remove(optThreads);
    props.Remove(optTedEval);
    props.Remove(optWithDomains);
    props.Remove(optDomain);
    props.Remove(optNoRewrites);
    props.Remove(optLocalFeaturesOnly);

    flags = new SeqClassifierFlags(props);
    classifier = new CRFClassifier<CoreLabel>(flags);
}
/// <summary>Copy constructor.</summary>
/// <remarks>
/// All option fields, the flags and the classifier reference are taken from
/// <paramref name="other"/>; only the tokenizer factory is created anew.
/// </remarks>
/// <param name="other">The segmenter whose settings are copied.</param>
public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
{
    // Plain option values.
    noRewrites = other.noRewrites;
    domain = other.domain;
    hasDomainLabels = other.hasDomainLabels;
    tedEvalPrefix = other.tedEvalPrefix;
    suffixMarker = other.suffixMarker;
    prefixMarker = other.prefixMarker;
    tokenizerOptions = other.tokenizerOptions;
    isTokenized = other.isTokenized;
    flags = other.flags;

    // ArabicTokenizerFactory is *not* threadsafe. Make a new copy.
    tf = GetTokenizerFactory();

    // CRFClassifier is threadsafe, so return a reference.
    classifier = other.classifier;
}
/// <summary>
/// Loads the non-word dictionary into <c>cd</c> the first time it is needed; does
/// nothing when <c>cd</c> is already set.
/// </summary>
/// <remarks>
/// The dictionary path is built from <c>corporaDict</c> (overridden by
/// <c>flags.sighanCorporaDict</c> when that is set). The PKU list is used when
/// <c>flags.usePk</c> is set, the CTB list otherwise. The same flag is used for
/// Sighan 2005, but the list is extracted from CTB.
/// </remarks>
/// <param name="flags">Supplies the corpora directory and the corpus selection switches.</param>
/// <exception cref="System.Exception">
/// <c>cd</c> is not yet loaded and one of <c>useAs</c>, <c>useHk</c>, <c>useMsr</c> is set.
/// </exception>
public NonDict2(SeqClassifierFlags flags)
{
    if (cd != null)
    {
        return;
    }

    if (flags.sighanCorporaDict != null)
    {
        corporaDict = flags.sighanCorporaDict;
    }

    if (flags.useAs || flags.useHk || flags.useMsr)
    {
        throw new Exception("only support settings for CTB and PKU now.");
    }

    // Past this point only PK and CTB remain; the original re-tested the AS/HK/MSR
    // flags before logging, which could never be true here.
    string path;
    if (flags.usePk)
    {
        path = corporaDict + "/dict/pku.non";
    }
    else
    {
        // CTB
        path = corporaDict + "/dict/ctb.non";
    }

    cd = new CorpusDictionary(path);

    // just output the msg...
    if (flags.usePk)
    {
        logger.Info("INFO: flags.usePk=true | building NonDict2 from " + path);
    }
    else
    {
        // CTB
        logger.Info("INFO: flags.usePk=false | building NonDict2 from " + path);
    }
}
/// <summary>
/// Trains a CRF classifier from a properties file and writes the model to
/// <paramref name="outputPath"/>.
/// </summary>
/// <param name="inputPath">
/// When not null, written into the properties as <c>trainFile</c>; when null the
/// properties file's own value (if any) is left untouched.
/// </param>
/// <param name="outputPath">
/// Written into the properties as <c>serializeTo</c> and also passed to
/// <c>serializeClassifier</c>.
/// </param>
/// <param name="properties">Path handed to <c>StringUtils.propFileToProperties</c>.</param>
public void createModelFromTrainingData(string inputPath, string outputPath, string properties)
{
    Properties settings = edu.stanford.nlp.util.StringUtils.propFileToProperties(properties);

    settings.setProperty("serializeTo", outputPath);
    if (inputPath != null)
    {
        settings.setProperty("trainFile", inputPath);
    }

    CRFClassifier classifier = new CRFClassifier(new SeqClassifierFlags(settings));
    classifier.train();
    classifier.serializeClassifier(outputPath);
}
/// <summary>
/// Iterates an <c>ObjectBankWrapper</c> over a small plain-text document and checks
/// every token's word and word shape (with <c>wordShape=chris2</c>) against the
/// expected arrays.
/// </summary>
public virtual void TestUsingIterator()
{
    string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";
    string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
    string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
    NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");

    Properties props = PropertiesUtils.AsProperties("wordShape", "chris2");
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();
    readerAndWriter.Init(flags);
    ReaderIteratorFactory rif = new ReaderIteratorFactory(new StringReader(s));
    ObjectBank <IList <CoreLabel> > di = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
    ICollection <string> knownLCWords = new HashSet <string>();
    ObjectBankWrapper <CoreLabel> obw = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

    try
    {
        int outIdx = 0;
        foreach (IList <CoreLabel> sent in obw)
        {
            foreach (CoreLabel cl in sent)
            {
                string tok = cl.Word();
                string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                outIdx++;
            }
        }
        if (outIdx < output.Length)
        {
            NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
        }
    }
    catch (NUnit.Framework.AssertionException)
    {
        // NUnit reports failed assertions by throwing an Exception subclass. Let those
        // through unchanged so a wrong token/shape or the "too few" failure is not
        // re-labelled as "too many things" by the handler below.
        throw;
    }
    catch (Exception e)
    {
        // Typically an out-of-range index into the expected arrays.
        NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
    }
}
/// <summary>
/// Submits <c>Threads</c> tasks to a fixed-size pool and waits for each of them,
/// failing when a task throws or reports <c>false</c>.
/// </summary>
public virtual void TestMultithreadedCombineSegmentedSentence()
{
    // NOTE(review): flags and labels are not referenced below. They were presumably
    // captured by the task body that is now passed as null to Submit - confirm and
    // restore the callable; as written no segmentation work is submitted.
    SeqClassifierFlags flags = CreateTestFlags();
    IList <CoreLabel> labels = CreateTestTokens();

    IList <IFuture <bool> > tasks = new List <IFuture <bool> >(Threads);
    IExecutorService executor = Executors.NewFixedThreadPool(Threads);
    for (int v = 0; v < Threads; v++)
    {
        IFuture <bool> f = executor.Submit(null);
        tasks.Add(f);
    }

    foreach (IFuture <bool> task in tasks)
    {
        // Get() is what surfaces exceptions thrown inside a task (generally NPEs from
        // multithreading issues). It must run in every build configuration, so it is
        // not wrapped in Debug.Assert, which is compiled out when DEBUG is undefined.
        bool succeeded = task.Get();
        if (!succeeded)
        {
            throw new System.InvalidOperationException("A segmentation task reported failure.");
        }
    }
}
/// <summary>
/// Creates the objective function, recording the training data and deriving the
/// layer sizes from the number of classes and <c>flags.numHiddenUnits</c>.
/// </summary>
/// <remarks>
/// Output layer sizes are <c>numClasses</c> (node) and <c>numClasses * numClasses</c> (edge).
/// With the output layer enabled, input layer sizes are those values multiplied by the
/// number of hidden units; with it disabled they equal the output layer sizes and a
/// message is logged.
/// </remarks>
/// <param name="data">Stored in <c>data</c>; its length sizes <c>docWindowLabels</c>.</param>
/// <param name="labels">Stored in <c>labels</c>.</param>
/// <param name="window">Stored in <c>window</c>.</param>
/// <param name="classIndex">Stored in <c>classIndex</c>; its size becomes <c>numClasses</c>.</param>
/// <param name="labelIndices">Stored in <c>labelIndices</c>.</param>
/// <param name="map">Stored in <c>map</c>.</param>
/// <param name="prior">Stored in <c>prior</c>.</param>
/// <param name="flags">Stored in <c>flags</c>; supplies sigma, background symbol and layer switches.</param>
/// <param name="numNodeFeatures">Stored in <c>numNodeFeatures</c>.</param>
/// <param name="numEdgeFeatures">Stored in <c>numEdgeFeatures</c>.</param>
/// <exception cref="System.Exception">
/// The output layer is enabled and <c>flags.softmaxOutputLayer</c> is set while neither
/// <c>flags.sparseOutputLayer</c> nor <c>flags.tieOutputLayer</c> is.
/// </exception>
internal CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, IIndex<string> classIndex, IList<IIndex<CRFLabel>> labelIndices, int[] map, int prior, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures)
{
    // Values taken directly from the arguments.
    this.data = data;
    this.labels = labels;
    this.window = window;
    this.classIndex = classIndex;
    this.labelIndices = labelIndices;
    this.map = map;
    this.prior = prior;
    this.flags = flags;
    this.numNodeFeatures = numNodeFeatures;
    this.numEdgeFeatures = numEdgeFeatures;

    // Values taken from the flags.
    this.backgroundSymbol = flags.backgroundSymbol;
    this.sigma = flags.sigma;
    this.numHiddenUnits = flags.numHiddenUnits;
    this.useOutputLayer = flags.useOutputLayer;
    this.useHiddenLayer = flags.useHiddenLayer;
    this.useSigmoid = flags.useSigmoid;

    // Derived sizes.
    this.numClasses = classIndex.Size();
    this.outputLayerSize = numClasses;
    this.outputLayerSize4Edge = numClasses * numClasses;
    this.docWindowLabels = new int[data.Length][];

    if (useOutputLayer)
    {
        this.inputLayerSize = numHiddenUnits * numClasses;
        this.inputLayerSize4Edge = numHiddenUnits * numClasses * numClasses;

        bool sparseOrTied = flags.sparseOutputLayer || flags.tieOutputLayer;
        if (flags.softmaxOutputLayer && !sparseOrTied)
        {
            throw new Exception("flags.softmaxOutputLayer == true, but neither flags.sparseOutputLayer or flags.tieOutputLayer is true");
        }
    }
    else
    {
        log.Info("Output layer not activated, inputLayerSize must be equal to numClasses, setting it to " + numClasses);
        this.inputLayerSize = numClasses;
        this.inputLayerSize4Edge = numClasses * numClasses;
    }
}
/// <summary>
/// Stores the flags, creates the line-based iterator factory and the sentence
/// processor, and loads whichever dictionaries the flags name.
/// </summary>
/// <remarks>
/// <c>cdict</c> is built from <c>flags.dictionary</c> (comma-separated) and then, when
/// <c>flags.serializedDictionary</c> is also set, replaced by the serialized one.
/// <c>cdict2</c> is built from <c>flags.dictionary2</c> (comma-separated).
/// </remarks>
/// <param name="flags">Supplies the normalization table and dictionary settings.</param>
public virtual void Init(SeqClassifierFlags flags)
{
    this.flags = flags;
    factory = LineIterator.GetFactory(new Sighan2005DocumentReaderAndWriter.CTBDocumentParser(this));

    // pichuan : flags.normalizationTable is null --> i believe this is replaced by some java class??
    // (Thu Apr 24 11:10:42 2008)
    cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable);

    if (flags.dictionary != null)
    {
        string[] dictionaryPaths = flags.dictionary.Split(",");
        cdict = new ChineseDictionary(dictionaryPaths, cdtos, flags.expandMidDot);
    }

    if (flags.serializedDictionary != null)
    {
        string serializedPath = flags.serializedDictionary;
        cdict = new ChineseDictionary(serializedPath, cdtos, flags.expandMidDot);
    }

    if (flags.dictionary2 != null)
    {
        string[] secondaryPaths = flags.dictionary2.Split(",");
        cdict2 = new ChineseDictionary(secondaryPaths, cdtos, flags.expandMidDot);
    }
}
/// <summary>
/// Post-processes the answer to be output. These post-processing steps do not
/// depend on the original input.
/// </summary>
/// <remarks>
/// The first matching flag wins, in the order HK, AS, PK, MSR; CTB handling is
/// the fallback when none of them is set.
/// </remarks>
/// <param name="ans">The segmented answer string.</param>
/// <param name="flags">Selects the post-processor and supplies its extra option.</param>
/// <returns>Whatever the selected post-processor returns for <paramref name="ans"/>.</returns>
private static string PostProcessingAnswer(string ans, SeqClassifierFlags flags)
{
    if (flags.useHk)
    {
        return hkPostProcessor.PostProcessingAnswer(ans);
    }
    if (flags.useAs)
    {
        return asPostProcessor.PostProcessingAnswer(ans);
    }
    if (flags.usePk)
    {
        return pkPostProcessor.PostProcessingAnswer(ans, flags.keepAllWhitespaces);
    }
    if (flags.useMsr)
    {
        return basicPostsProcessor.PostProcessingAnswer(ans);
    }

    // CTB post processing.
    return ctpPostProcessor.PostProcessingAnswer(ans, flags.suppressMidDotPostprocessing);
}
/// <summary>
/// Creates an <c>EmpiricalNERPriorBIO</c> for the given document.
/// </summary>
/// <param name="backgroundSymbol">
/// Not used by this implementation: the prior is built with
/// <c>flags.backgroundSymbol</c> instead.
/// </param>
/// <param name="classIndex">Passed through to the prior.</param>
/// <param name="tagIndex">Passed through to the prior.</param>
/// <param name="document">Passed through to the prior.</param>
/// <param name="entityMatrices">Passed through to the prior.</param>
/// <param name="flags">Passed through to the prior; also the source of the background symbol.</param>
/// <returns>The newly created prior.</returns>
public virtual IListeningSequenceModel GetInstance(string backgroundSymbol, IIndex<string> classIndex, IIndex<string> tagIndex, IList<In> document, Pair<double[][], double[][]> entityMatrices, SeqClassifierFlags flags)
{
    EntityCachingAbstractSequencePriorBIO<In> empiricalPrior =
        new EmpiricalNERPriorBIO<In>(flags.backgroundSymbol, classIndex, tagIndex, document, entityMatrices, flags);
    return empiricalPrior;
}
/// <summary>
/// Creates the classifier; all work is done by the base constructor.
/// </summary>
/// <param name="flags">Forwarded unchanged to the base constructor.</param>
public CRFClassifierNonlinear(SeqClassifierFlags flags)
    : base(flags)
{
}
/// <summary>
/// Initializes this instance by delegating to the base implementation; nothing is
/// added here.
/// </summary>
/// <param name="flags">Forwarded unchanged to <c>base.Init</c>.</param>
public override void Init(SeqClassifierFlags flags)
{
    base.Init(flags);
}
/// <summary>
/// Runs one particular CRF of a ClassifierCombiner on a test file (or files).
/// </summary>
/// <remarks>
/// The user can say <c>-crfToExamine 0</c> to get the first element, or
/// <c>-crfToExamine /edu/stanford/models/muc7.crf.ser.gz</c> to select by load path.
/// This does not currently support drill down on CMMs. When no classifier can be
/// selected, or neither <paramref name="testFile"/> nor <paramref name="testFiles"/>
/// is given, nothing is done. <paramref name="testFile"/> takes precedence over
/// <paramref name="testFiles"/>.
/// </remarks>
/// <param name="cc">Combiner whose <c>baseClassifiers</c> and <c>initLoadPaths</c> are searched.</param>
/// <param name="crfNameOrIndex">Either an index into <c>baseClassifiers</c> or an entry of <c>initLoadPaths</c>.</param>
/// <param name="flags">Selects which report to produce.</param>
/// <param name="testFile">Single test file, or null.</param>
/// <param name="testFiles">Comma-separated test files, or null.</param>
/// <param name="readerAndWriter">Reader/writer used for the single-file reports.</param>
/// <exception cref="System.Exception"/>
public static void ExamineCRF(Edu.Stanford.Nlp.IE.ClassifierCombiner cc, string crfNameOrIndex, SeqClassifierFlags flags, string testFile, string testFiles, IDocumentReaderAndWriter <CoreLabel> readerAndWriter)
{
    // Resolve the selector:
    // 1. first see if it is an index into baseClassifiers
    // 2. if it is not an integer or is out of range, see if it is a load path.
    // TryParse is used because a non-numeric selector must fall through to the
    // load-path lookup rather than raise a parsing exception.
    int ci;
    bool isInteger = int.TryParse(crfNameOrIndex, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out ci);
    if (!isInteger || ci < 0 || ci >= cc.baseClassifiers.Count)
    {
        ci = cc.initLoadPaths.IndexOf(crfNameOrIndex);
    }
    if (ci < 0 || ci >= cc.baseClassifiers.Count)
    {
        // No classifier could be selected: do nothing.
        return;
    }

    // TODO: this will break if baseClassifiers contains something that is not a CRF
    CRFClassifier <CoreLabel> crf = (CRFClassifier <CoreLabel>)cc.baseClassifiers[ci];
    if (crf == null)
    {
        return;
    }

    if (testFile != null)
    {
        // Single test file: the first matching flag decides the report.
        if (flags.searchGraphPrefix != null)
        {
            crf.ClassifyAndWriteViterbiSearchGraph(testFile, flags.searchGraphPrefix, crf.MakeReaderAndWriter());
        }
        else if (flags.printFirstOrderProbs)
        {
            crf.PrintFirstOrderProbs(testFile, readerAndWriter);
        }
        else if (flags.printFactorTable)
        {
            crf.PrintFactorTable(testFile, readerAndWriter);
        }
        else if (flags.printProbs)
        {
            crf.PrintProbs(testFile, readerAndWriter);
        }
        else if (flags.useKBest)
        {
            // TO DO: handle if user doesn't provide kBest
            int k = flags.kBest;
            crf.ClassifyAndWriteAnswersKBest(testFile, k, readerAndWriter);
        }
        else if (flags.printLabelValue)
        {
            crf.PrintLabelInformation(testFile, readerAndWriter);
        }
        else
        {
            // no crf test flag provided
            log.Info("Warning: no crf test flag was provided, running classify and write answers");
            crf.ClassifyAndWriteAnswers(testFile, readerAndWriter, true);
        }
    }
    else if (testFiles != null)
    {
        // Several test files. The stream pipeline this replaces passed a null
        // mapping function; build the list directly, one File per path.
        // NOTE(review): assumes File has a constructor taking the path string - confirm.
        IList <File> files = new List <File>();
        foreach (string path in testFiles.Split(","))
        {
            files.Add(new File(path));
        }

        if (flags.printProbs)
        {
            crf.PrintProbs(files, crf.DefaultReaderAndWriter());
        }
        else
        {
            log.Info("Warning: no crf test flag was provided, running classify files and write answers");
            crf.ClassifyFilesAndWriteAnswers(files, crf.DefaultReaderAndWriter(), true);
        }
    }
}
/// <summary>
/// Reads a CoNLL-style column file and returns one <c>DataInstance</c> per sentence.
/// </summary>
/// <remarks>
/// Tokens whose word equals <c>CoNLLDocumentReaderAndWriter.Boundary</c> or
/// <c>-DOCSTART-</c> end the current sentence and are not added to it. Each remaining
/// token gets its 1-based index within the sentence and its word copied into the
/// value, text and original-text annotations. Sentences are keyed
/// <c>sentIDprefix-N</c> with N counting from 0.
/// </remarks>
/// <param name="reader">Source of the column data.</param>
/// <param name="categoriesAllowed">Not read by this method.</param>
/// <param name="setClassForTheseLabels">
/// Optional map from answer label to annotation class; when a token's label is a key,
/// the label is also stored under the mapped class. May be null.
/// </param>
/// <param name="setGoldClass">When true, the answer label is also stored as the gold answer.</param>
/// <param name="sentIDprefix">Prefix of the generated sentence ids.</param>
/// <returns>Map from generated sentence id to its data instance.</returns>
public static IDictionary <string, DataInstance> ParseColumnFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
    Properties props = new Properties();
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    flags.entitySubclassification = "noprefix";
    flags.retainEntitySubclassification = false;
    conllreader.Init(flags);

    IEnumerator <IList <CoreLabel> > dociter = conllreader.GetIterator(reader);
    int num = -1;
    IDictionary <string, DataInstance> sents = new Dictionary <string, DataInstance>();
    while (dociter.MoveNext())
    {
        IList <CoreLabel> doc = dociter.Current;
        // Tokens of the sentence currently being collected.
        IList <CoreLabel> sentcore = new List <CoreLabel>();
        int tokenindex = 0;
        foreach (CoreLabel l in doc)
        {
            if (l.Word().Equals(CoNLLDocumentReaderAndWriter.Boundary) || l.Word().Equals("-DOCSTART-"))
            {
                // Sentence boundary: emit what has been collected so far, if anything.
                if (sentcore.Count > 0)
                {
                    num++;
                    string docid = sentIDprefix + "-" + num.ToString();
                    DataInstance sentInst = DataInstance.GetNewSurfaceInstance(sentcore);
                    sents[docid] = sentInst;
                    sentcore = new List <CoreLabel>();
                    tokenindex = 0;
                }
                continue;
            }

            tokenindex++;
            l.Set(typeof(CoreAnnotations.IndexAnnotation), tokenindex);
            l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
            string label = l.Get(typeof(CoreAnnotations.AnswerAnnotation));
            System.Diagnostics.Debug.Assert(label != null, "label cannot be null");
            l.Set(typeof(CoreAnnotations.TextAnnotation), l.Word());
            l.Set(typeof(CoreAnnotations.OriginalTextAnnotation), l.Word());
            if (setGoldClass)
            {
                l.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
            }

            // Single key lookup; the null guard avoids an ArgumentNullException from
            // the dictionary when the assert above is compiled out.
            Type labelClass;
            if (setClassForTheseLabels != null && label != null && setClassForTheseLabels.TryGetValue(label, out labelClass))
            {
                l.Set(labelClass, label);
            }
            sentcore.Add(l);
        }

        // Emit the trailing sentence of this document.
        if (sentcore.Count > 0)
        {
            num++;
            string docid = sentIDprefix + "-" + num.ToString();
            DataInstance sentInst = DataInstance.GetNewSurfaceInstance(sentcore);
            sents[docid] = sentInst;
        }
    }
    return(sents);
}
/// <summary>
/// Stores the flags and creates the iterator factory that splits the input on
/// <c>DOC</c> elements and parses each with a <c>MUCDocumentParser</c>.
/// </summary>
/// <param name="flags">Stored in the <c>flags</c> field.</param>
public virtual void Init(SeqClassifierFlags flags)
{
    MUCDocumentReaderAndWriter.MUCDocumentParser parser = new MUCDocumentReaderAndWriter.MUCDocumentParser();

    this.flags = flags;
    factory = XMLBeginEndIterator.GetFactory("DOC", parser, true, true);
}
/// <summary>
/// Creates the classifier; all work is done by the base constructor.
/// </summary>
/// <param name="flags">Forwarded unchanged to the base constructor.</param>
public CRFClassifierNoisyLabel(SeqClassifierFlags flags)
    : base(flags)
{
}
/// <summary>
/// Convenience overload that uses <c>QuadraticPrior</c> as the prior and forwards
/// every other argument unchanged to the full constructor.
/// </summary>
/// <param name="data">Forwarded to the full constructor.</param>
/// <param name="labels">Forwarded to the full constructor.</param>
/// <param name="window">Forwarded to the full constructor.</param>
/// <param name="classIndex">Forwarded to the full constructor.</param>
/// <param name="labelIndices">Forwarded to the full constructor.</param>
/// <param name="map">Forwarded to the full constructor.</param>
/// <param name="flags">Forwarded to the full constructor.</param>
/// <param name="numNodeFeatures">Forwarded to the full constructor.</param>
/// <param name="numEdgeFeatures">Forwarded to the full constructor.</param>
internal CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, IIndex<string> classIndex, IList<IIndex<CRFLabel>> labelIndices, int[] map, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures)
    : this(data, labels, window, classIndex, labelIndices, map, QuadraticPrior, flags, numNodeFeatures, numEdgeFeatures)
{
}
/// <summary>
/// Command-line entry point: segments standard input line by line with a
/// max-match segmenter and prints the result through the Sighan 2005 writer.
/// </summary>
/// <remarks>
/// Requires the <c>lexicon</c> property; without it an error is logged and the process
/// exits with code 1. The <c>greedy</c> property selects greedy segmentation, the
/// <c>maxwords</c> property selects the max-words heuristic, otherwise plain max-match
/// segmentation is used. Reading stops at end of input or on an <c>IOException</c>.
/// </remarks>
/// <param name="args">Command-line arguments, converted to properties.</param>
public static void Main(string[] args)
{
    Properties props = StringUtils.ArgsToProperties(args);
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();

    string lexiconFile = props.GetProperty("lexicon");
    if (lexiconFile == null)
    {
        logger.Error("Error: no lexicon file!");
        System.Environment.Exit(1);
    }
    else
    {
        seg.AddLexicon(lexiconFile);
    }

    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.Init(flags);
    BufferedReader br = new BufferedReader(new InputStreamReader(Runtime.@in));
    PrintWriter stdoutW = new PrintWriter(System.Console.Out);

    int lineNb = 0;
    while (true)
    {
        ++lineNb;
        logger.Info("line: " + lineNb);
        try
        {
            string line = br.ReadLine();
            if (line == null)
            {
                break;
            }

            string outputLine;
            if (props.GetProperty("greedy") != null)
            {
                List <Word> sentence = seg.GreedilySegmentWords(line);
                outputLine = SentenceUtils.ListToString(sentence);
            }
            else
            {
                // Both lattice-based strategies start from the same lattice.
                seg.BuildSegmentationLattice(line);
                if (props.GetProperty("maxwords") != null)
                {
                    outputLine = SentenceUtils.ListToString(seg.SegmentWords(MaxMatchSegmenter.MatchHeuristic.Maxwords));
                }
                else
                {
                    outputLine = SentenceUtils.ListToString(seg.MaxMatchSegmentation());
                }
            }

            // Re-read the segmented line through the Sighan reader and print each document.
            StringReader strR = new StringReader(outputLine);
            IEnumerator <IList <CoreLabel> > itr = sighanRW.GetIterator(strR);
            while (itr.MoveNext())
            {
                sighanRW.PrintAnswers(itr.Current, stdoutW);
            }
        }
        catch (IOException)
        {
            // Stop reading; whatever was produced so far is still flushed below.
            break;
        }
    }
    stdoutW.Flush();
}
/// <summary>Required, but unused.</summary>
/// <param name="flags">Ignored; this implementation has an empty body.</param>
public virtual void Init(SeqClassifierFlags flags)
{
}
/// <summary>
/// Joins the per-character labels of a segmented document back into one string,
/// inserting a separator before each character where a word break is decided.
/// </summary>
/// <remarks>
/// For every label the original character is appended. A separator is inserted before it
/// according to the rules below. A separator that corresponds to whitespace present in the
/// input is first written as U+1924 (a Limbu character) so it can be told apart from an
/// inserted space; all U+1924 characters are turned into spaces before returning.
/// <para>
/// Label answered <c>"1"</c> and not at position <c>"0"</c>: separate by default, except
/// that with <c>keepEnglishWhitespaces</c> two adjacent ASCII letters that had no
/// whitespace between them stay joined; <c>keepAllWhitespaces</c> restores the separator
/// whenever there was whitespace.
/// </para>
/// <para>
/// Otherwise: join by default, except that (a) with <c>separateASCIIandRange</c> an
/// ASCII/non-ASCII boundary is separated unless both characters are numbers, (b) with
/// <c>keepEnglishWhitespaces</c> original whitespace between letter/letter, letter/number
/// or number/letter pairs is kept, and (c) with <c>keepAllWhitespaces</c> original
/// whitespace is kept for any label not at position <c>"0"</c>.
/// </para>
/// History: the whitespace handling was rewritten in June 2006 (cer); the earlier
/// "testContent" buffer was dropped in October 2007 (pichuan).
/// </remarks>
/// <param name="doc">Character-level labels in document order.</param>
/// <param name="flags">Whitespace and post-processing options.</param>
/// <returns>
/// The combined string, run through <c>PostProcessingAnswer</c> when
/// <c>flags.sighanPostProcessing</c> is set.
/// </returns>
public static string CombineSegmentedSentence(IList <CoreLabel> doc, SeqClassifierFlags flags)
{
    // the actual output we will return
    StringBuilder ans = new StringBuilder();
    // previous label; null while processing the first one
    CoreLabel pwi = null;
    int testContentIdx = 0;

    foreach (CoreLabel wi in doc)
    {
        bool originalWhiteSpace = "1".Equals(wi.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation)));
        bool hasPrevious = testContentIdx > 0;
        bool seg;

        // if the CRF says "START" (segmented), and it's not the first word..
        if (wi.Get(typeof(CoreAnnotations.AnswerAnnotation)).Equals("1") && !("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())))
        {
            // since it's in the "1" condition.. default is to seg
            seg = true;

            // check if we need to preserve the "no space" between English characters
            if (flags.keepEnglishWhitespaces && hasPrevious)
            {
                char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                if (IsLetterASCII(prevChar) && IsLetterASCII(currChar) && !originalWhiteSpace)
                {
                    // keep the "non space" before wi
                    seg = false;
                }
            }

            // if there was space and keepAllWhitespaces is true, restore it no matter what
            if (flags.keepAllWhitespaces && originalWhiteSpace)
            {
                seg = true;
            }
        }
        else
        {
            // since it's in the "0" condition.. default is not to seg
            seg = false;

            if (hasPrevious)
            {
                char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];

                // Words consisting of English/ASCII characters should be separated from
                // the surrounding Chinese characters.
                if ((prevChar < (char)128) != (currChar < (char)128))
                {
                    // cdm: the number/number case would be an ASCII number next to a
                    // Unihan range number; those are left joined.
                    if (!(ChineseUtils.IsNumber(prevChar) && ChineseUtils.IsNumber(currChar)) && flags.separateASCIIandRange)
                    {
                        seg = true;
                    }
                }

                if (flags.keepEnglishWhitespaces)
                {
                    if (IsLetterASCII(prevChar) && IsLetterASCII(currChar) || IsLetterASCII(prevChar) && ChineseUtils.IsNumber(currChar) || ChineseUtils.IsNumber(prevChar) && IsLetterASCII(currChar))
                    {
                        // keep the "space" before wi
                        if (originalWhiteSpace)
                        {
                            seg = true;
                        }
                    }
                }
            }

            // if there was space and keepAllWhitespaces is true, restore it no matter what
            if (flags.keepAllWhitespaces)
            {
                if (!("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())) && originalWhiteSpace)
                {
                    seg = true;
                }
            }
        }

        if (seg)
        {
            if (originalWhiteSpace)
            {
                // a pretty Limbu character which is later changed to a space
                ans.Append('\u1924');
            }
            else
            {
                ans.Append(' ');
            }
        }
        ans.Append(wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation)));

        pwi = wi;
        testContentIdx++;
    }

    string ansStr = ans.ToString();
    if (flags.sighanPostProcessing)
    {
        if (!flags.keepAllWhitespaces)
        {
            // remove the Limbu char now, so it can be deleted in postprocessing
            ansStr = ansStr.ReplaceAll("\u1924", " ");
        }
        ansStr = PostProcessingAnswer(ansStr, flags);
    }
    // definitely remove the Limbu char if it survived till now
    ansStr = ansStr.ReplaceAll("\u1924", " ");
    return(ansStr);
}