/*
 * Legacy constructor, kept for reference only:
 *
 * public ChineseDocumentToSentenceProcessor(String normalizationTableFile, String encoding) {
 *   log.info("WARNING: ChineseDocumentToSentenceProcessor ignores normalizationTableFile argument!");
 *   log.info("WARNING: ChineseDocumentToSentenceProcessor ignores encoding argument!");
 *   // encoding is never read locally
 *   this.encoding = encoding;
 * }
 */

/// <summary>
/// Normalizes a string in two passes: first through <c>ChineseUtils.Normalize</c>,
/// then through this class's own <c>Normalize</c>.
/// </summary>
/// <remarks>
/// This should now become disused, and other people should call
/// ChineseUtils directly! CDM June 2006.
/// </remarks>
/// <param name="in">The text to normalize.</param>
/// <returns>The result of applying both normalization passes to <paramref name="in"/>.</returns>
public virtual string Normalization(string @in)
{
    string chineseNormalized = ChineseUtils.Normalize(@in);
    return Normalize(chineseNormalized);
}
/// <summary>
/// Checks <c>ChineseUtils.Normalize</c> against one mixed English/Chinese/emoji sample
/// under three option triples: all Leave, Ascii/Ascii/Normalize, and all Fullwidth.
/// </summary>
public virtual void TestNormalize()
{
    string sample = "Hello English - 你好\u3000班汉·西巴阿差\u3000Chris•Manning \uD83E\uDD16\uD83E\uDD16robot";

    // Expected results, one per option triple.
    string expectedLeave = "Hello English - 你好\u3000班汉·西巴阿差\u3000Chris•Manning \uD83E\uDD16\uD83E\uDD16robot";
    string expectedAscii = "Hello English - 你好 班汉·西巴阿差 Chris·Manning \uD83E\uDD16\uD83E\uDD16robot";
    string expectedFullwidth = "Hello\u3000\u3000English\u3000-\u3000你好 班汉・西巴阿差 Chris・Manning\u3000\uD83E\uDD16\uD83E\uDD16robot";

    string actualLeave = ChineseUtils.Normalize(sample, ChineseUtils.Leave, ChineseUtils.Leave, ChineseUtils.Leave);
    NUnit.Framework.Assert.AreEqual(expectedLeave, actualLeave);

    string actualAscii = ChineseUtils.Normalize(sample, ChineseUtils.Ascii, ChineseUtils.Ascii, ChineseUtils.Normalize);
    NUnit.Framework.Assert.AreEqual(expectedAscii, actualAscii);

    string actualFullwidth = ChineseUtils.Normalize(sample, ChineseUtils.Fullwidth, ChineseUtils.Fullwidth, ChineseUtils.Fullwidth);
    NUnit.Framework.Assert.AreEqual(expectedFullwidth, actualFullwidth);
}
/// <summary>
/// Loads a word list (one entry per line, read as UTF-8) into a set.
/// </summary>
/// <remarks>
/// Every line is trimmed. A line whose length changes when trimmed is reported on the
/// error writer, and so is a line that is empty after trimming; empty lines are not
/// added to the result.
/// </remarks>
/// <param name="filename">Location passed to <c>IOUtils.GetInputStreamFromURLOrClasspathOrFileSystem</c>.</param>
/// <param name="normalize">
/// When true, each entry is passed through <c>ChineseUtils.Normalize</c>
/// (Ascii, Ascii, Normalize) before being stored.
/// </param>
/// <returns>The collection of entries read from the file.</returns>
/// <exception cref="RuntimeIOException">Wraps any <c>IOException</c> raised while reading.</exception>
private static ICollection<string> ReadDict(string filename, bool normalize)
{
    ICollection<string> entries = Generics.NewHashSet();
    string mode = normalize ? "normalized" : "unnormalized";
    logger.Info("Loading " + mode + " dictionary from " + filename);
    try
    {
        using (InputStream stream = IOUtils.GetInputStreamFromURLOrClasspathOrFileSystem(filename))
        {
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
            int lineNumber = 0;
            string rawLine;
            while ((rawLine = reader.ReadLine()) != null)
            {
                lineNumber++;
                string entry = rawLine.Trim();

                // Report lines that carried surrounding whitespace.
                if (entry.Length != rawLine.Length)
                {
                    EncodingPrintWriter.Err.Println("Line " + lineNumber + " of " + filename + " has leading/trailing whitespace: |" + entry + "|", "UTF-8");
                }

                // Report and skip lines that hold nothing once trimmed.
                if (entry.Length == 0)
                {
                    EncodingPrintWriter.Err.Println("Line " + lineNumber + " of " + filename + " is empty", "UTF-8");
                    continue;
                }

                if (normalize)
                {
                    entry = ChineseUtils.Normalize(entry, ChineseUtils.Ascii, ChineseUtils.Ascii, ChineseUtils.Normalize);
                }
                entries.Add(entry);
            }
        }
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
    return entries;
}
/// <summary>
/// Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
/// of binary predictions.
/// </summary>
/// <remarks>
/// This function does a limited amount of post-processing:
/// preserve white spaces of the input, and not segment between two latin characters or
/// between two digits. Consequently, the probabilities of all paths in answerLattice
/// may not sum to 1 (they do sum to 1 if no post processing applies).
/// </remarks>
/// <param name="tSource">Current node in Viterbi search graph.</param>
/// <param name="aSource">Current node in answer lattice.</param>
/// <param name="answer">Partial word starting at aSource.</param>
/// <param name="nodeId">Counter incremented each time a new answer-lattice node is created.</param>
/// <param name="pos">Current position in docArray.</param>
/// <param name="cost">Current cost of answer.</param>
/// <param name="stateLinks">
/// Maps nodes of the search graph to nodes in answer lattice
/// (when paths of the search graph are recombined, paths of the answer lattice should be
/// recombined as well, if at word boundary).
/// </param>
/// <param name="answerLattice">Lattice that newly created answer nodes are attached to.</param>
/// <param name="docArray">Character labels of the document, indexed by <paramref name="pos"/>.</param>
private void TagLatticeToAnswerLattice(DFSAState<string, int> tSource, DFSAState<string, int> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, IDictionary<DFSAState<string, int>, DFSAState<string, int>> stateLinks, DFSA<string, int> answerLattice, CoreLabel[] docArray)
{
    // Add "1" prediction after the end of the sentence, if applicable:
    if (tSource.IsAccepting() && tSource.ContinuingInputs().IsEmpty())
    {
        tSource.AddTransition(new DFSATransition<string, int>(string.Empty, tSource, new DFSAState<string, int>(-1, null), "1", string.Empty, 0));
    }

    // Get current label, character, and prediction:
    CoreLabel curLabel = (pos < docArray.Length) ? docArray[pos] : null;
    string curChr = null;
    string origSpace = null;
    if (curLabel != null)
    {
        curChr = curLabel.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
        System.Diagnostics.Debug.Assert(curChr.Length == 1);
        origSpace = curLabel.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation));
    }

    // Get set of successors in search graph:
    ICollection<string> inputs = tSource.ContinuingInputs();

    // Only keep most probable transition out of initial state:
    string answerConstraint = null;
    if (pos == 0)
    {
        double minCost = double.PositiveInfinity;
        foreach (string candidate in inputs)
        {
            double candidateCost = tSource.Transition(candidate).Score();
            if (candidateCost < minCost && candidate != null)
            {
                // .NET composite format items replace the Java %s/%e/%n specifiers,
                // which string.Format would otherwise emit verbatim.
                logger.Info(string.Format(System.Globalization.CultureInfo.InvariantCulture, "mincost ({0}): {1:e} -> {2:e}{3}", candidate, minCost, candidateCost, System.Environment.NewLine));
                minCost = candidateCost;
                answerConstraint = candidate;
            }
        }
    }

    // Follow along each transition:
    foreach (string predictSpace in inputs)
    {
        DFSATransition<string, int> transition = tSource.Transition(predictSpace);
        DFSAState<string, int> tDest = transition.Target();
        DFSAState<string, int> newASource = aSource;
        StringBuilder newAnswer = new StringBuilder(answer.ToString());
        int answerLen = newAnswer.Length;
        // Last character of the partial word, as a one-character string (null if the word is empty).
        string prevChr = (answerLen > 0) ? newAnswer[answerLen - 1].ToString() : null;
        double newCost = cost;
        bool predictsBoundary = "1".Equals(predictSpace);

        // Ignore paths starting with zero:
        if (answerConstraint != null && !answerConstraint.Equals(predictSpace))
        {
            logger.Info(string.Format("Skipping transition {0} at pos 0.{1}", predictSpace, System.Environment.NewLine));
            continue;
        }

        // Ignore paths not consistent with input segmentation:
        if (flags.keepAllWhitespaces && "0".Equals(predictSpace) && "1".Equals(origSpace))
        {
            logger.Info(string.Format("Skipping non-boundary at pos {0}, since space in the input.{1}", pos, System.Environment.NewLine));
            continue;
        }

        // Ignore paths adding segment boundaries between two latin characters, or between two digits:
        // (unless already present in original input)
        if (predictsBoundary && "0".Equals(origSpace) && prevChr != null && curChr != null)
        {
            char p = prevChr[0];
            char c = curChr[0];
            if (ChineseStringUtils.IsLetterASCII(p) && ChineseStringUtils.IsLetterASCII(c))
            {
                logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two ASCII letters ({1} and {2}).{3}", pos, prevChr, curChr, System.Environment.NewLine));
                continue;
            }
            if (ChineseUtils.IsNumber(p) && ChineseUtils.IsNumber(c))
            {
                logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two numeral characters ({1} and {2}).{3}", pos, prevChr, curChr, System.Environment.NewLine));
                continue;
            }
        }

        // If predictSpace==1, create a new transition in answer search graph:
        if (predictsBoundary && answerLen > 0)
        {
            string word = newAnswer.ToString();

            // If answer destination node visited before, create a new edge and leave:
            DFSAState<string, int> knownDest;
            if (stateLinks.TryGetValue(tSource, out knownDest))
            {
                newASource.AddTransition(new DFSATransition<string, int>(string.Empty, newASource, knownDest, word, string.Empty, newCost));
                continue;
            }

            // If answer destination node not visited before, create it + new edge:
            nodeId.IncValue(1);
            DFSAState<string, int> freshDest = new DFSAState<string, int>(nodeId, answerLattice, 0.0);
            stateLinks[tSource] = freshDest;
            newASource.AddTransition(new DFSATransition<string, int>(string.Empty, newASource, freshDest, word, string.Empty, newCost));

            // Reached an accepting state:
            if (tSource.IsAccepting())
            {
                freshDest.SetAccepting(true);
                continue;
            }

            // Start new answer edge:
            newASource = freshDest;
            newAnswer = new StringBuilder();
            newCost = 0.0;
        }

        System.Diagnostics.Debug.Assert(curChr != null);
        newAnswer.Append(curChr);
        newCost += transition.Score();

        // Recurse while the path cost is below the prune threshold; paths whose current
        // character is an ASCII letter are followed regardless of cost.
        if (newCost < flags.searchGraphPrune || ChineseStringUtils.IsLetterASCII(curChr[0]))
        {
            TagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
        }
    }
}
/// <summary>
/// Rebuilds the output sentence from per-character labels, inserting a separator before
/// each character where the segmenter answer and the whitespace flags call for one.
/// </summary>
/// <remarks>
/// A separator that corresponds to whitespace present in the input is first written as the
/// placeholder character U+1924 so that it can be told apart from a predicted boundary; all
/// placeholders are turned into plain spaces before returning.
///
/// History: the whitespace normalization here was rewritten by cer (6/14/2006). pichuan
/// (Oct 7 2007) noted that the old "testContent" buffer is no longer used, because the whole
/// test file was being re-read and testContentIdx reset to 0 every time while "doc" moved
/// line by line; the index now only tells whether a previous character exists.
/// </remarks>
/// <param name="doc">Character labels of one sentence, in order.</param>
/// <param name="flags">
/// Options read here: keepEnglishWhitespaces, keepAllWhitespaces, separateASCIIandRange,
/// sighanPostProcessing.
/// </param>
/// <returns>The segmented sentence as a string.</returns>
public static string CombineSegmentedSentence(IList<CoreLabel> doc, SeqClassifierFlags flags)
{
    int testContentIdx = 0;
    StringBuilder ans = new StringBuilder(); // the actual output we will return
    CoreLabel wi = null;
    foreach (CoreLabel label in doc)
    {
        CoreLabel pwi = wi;
        wi = label;
        bool originalWhiteSpace = "1".Equals(wi.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation)));
        bool seg;

        // if the CRF says "START" (segmented), and it's not the first word..
        if (wi.Get(typeof(CoreAnnotations.AnswerAnnotation)).Equals("1") && !("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())))
        {
            // since it's in the "1" condition.. default is to seg
            seg = true;

            // check if we need to preserve the "no space" between English characters
            if (flags.keepEnglishWhitespaces && testContentIdx > 0)
            {
                char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                if (IsLetterASCII(prevChar) && IsLetterASCII(currChar) && !originalWhiteSpace)
                {
                    // keep the "non space" before wi
                    seg = false;
                }
            }

            // if there was space and keepAllWhitespaces is true, restore it no matter what
            if (flags.keepAllWhitespaces && originalWhiteSpace)
            {
                seg = true;
            }
        }
        else
        {
            // since it's in the "0" condition.. default is not to seg
            seg = false;

            // Changed after conversation with Huihsin.
            //
            // Decided that all words consisting of English/ASCII characters
            // should be separated from the surrounding Chinese characters. -cer
            // (pichuan, Oct 7 2007: code changed, intended to do the same thing.)
            if (testContentIdx > 0)
            {
                char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                bool asciiBoundary = (prevChar < (char)128) != (currChar < (char)128);
                if (asciiBoundary && !(ChineseUtils.IsNumber(prevChar) && ChineseUtils.IsNumber(currChar)))
                {
                    // cdm: the number/number case excluded above would arise if you had an
                    // ASCII number next to a Unihan range number. Does that happen? It
                    // presumably shouldn't do any harm.... [cdm, oct 2007]
                    if (flags.separateASCIIandRange)
                    {
                        seg = true;
                    }
                }
            }

            if (flags.keepEnglishWhitespaces && testContentIdx > 0)
            {
                char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                bool alphanumericPair =
                    IsLetterASCII(prevChar) && IsLetterASCII(currChar)
                    || IsLetterASCII(prevChar) && ChineseUtils.IsNumber(currChar)
                    || ChineseUtils.IsNumber(prevChar) && IsLetterASCII(currChar);
                if (alphanumericPair && originalWhiteSpace)
                {
                    // keep the "space" before wi
                    seg = true;
                }
            }

            // if there was space and keepAllWhitespaces is true, restore it no matter what
            if (flags.keepAllWhitespaces)
            {
                if (!("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())) && originalWhiteSpace)
                {
                    seg = true;
                }
            }
        }

        if (seg)
        {
            // U+1924 is a pretty Limbu character which is later changed to a space
            ans.Append(originalWhiteSpace ? '\u1924' : ' ');
        }
        ans.Append(wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation)));
        testContentIdx++;
    }

    string ansStr = ans.ToString();
    if (flags.sighanPostProcessing)
    {
        if (!flags.keepAllWhitespaces)
        {
            // remove the Limbu char now, so it can be deleted in postprocessing
            ansStr = ansStr.Replace('\u1924', ' ');
        }
        ansStr = PostProcessingAnswer(ansStr, flags);
    }

    // definitely remove the Limbu char if it survived till now
    return ansStr.Replace('\u1924', ' ');
}
/// <summary>
/// Splits plain text into sentences at the characters contained in <c>fullStopsSet</c>.
/// </summary>
/// <remarks>
/// The text is first normalized with <c>ChineseUtils.Normalize</c> (Leave/Ascii when
/// <paramref name="segmented"/> is true, Fullwidth/Ascii otherwise). After a sentence-ending
/// character, any following characters from <c>rightMarkSet</c>, whitespace characters and
/// further full stops stay attached to the same sentence; the first other character starts
/// a new one. Each sentence is passed through <c>RemoveWhitespace</c> and dropped if the
/// result is empty.
/// </remarks>
/// <param name="contentString">The text to split.</param>
/// <param name="segmented">
/// When true, a full stop only ends a sentence if it is the first character or follows a
/// separator character (a standalone punctuation mark -- cf. URLs).
/// </param>
/// <returns>The sentences found, in input order.</returns>
/// <exception cref="System.IO.IOException"/>
public static IList<string> FromPlainText(string contentString, bool segmented)
{
    if (segmented)
    {
        contentString = ChineseUtils.Normalize(contentString, ChineseUtils.Leave, ChineseUtils.Ascii);
    }
    else
    {
        contentString = ChineseUtils.Normalize(contentString, ChineseUtils.Fullwidth, ChineseUtils.Ascii);
    }

    IList<string> sentenceList = new List<string>();
    // Accumulate in a builder: per-character string concatenation is quadratic in sentence length.
    System.Text.StringBuilder sentence = new System.Text.StringBuilder();
    bool sentenceEnd = false;
    int lastCh = -1; // -1 means "no previous character yet"

    foreach (char c in contentString)
    {
        if (!sentenceEnd)
        {
            sentence.Append(c);
            if (fullStopsSet.Contains(c))
            {
                // In segmented text, require a standalone punctuation mark -- cf. URLs.
                // char.IsSeparator covers the space, line and paragraph separator categories.
                if (!segmented || lastCh == -1 || char.IsSeparator((char)lastCh))
                {
                    sentenceEnd = true;
                }
            }
        }
        else if (rightMarkSet.Contains(c) || c.ToString().Matches("\\s") || fullStopsSet.Contains(c))
        {
            // Closing marks, whitespace and repeated full stops still belong to the ended sentence.
            sentence.Append(c);
        }
        else
        {
            // First character of the next sentence: flush the finished one.
            if (sentence.Length > 0)
            {
                sentenceEnd = false;
            }
            string finished = RemoveWhitespace(sentence.ToString(), segmented);
            if (finished.Length > 0)
            {
                sentenceList.Add(finished);
            }
            sentence.Length = 0;
            sentence.Append(c);
        }
        lastCh = c;
    }

    // Flush whatever remains after the last character.
    string remainder = RemoveWhitespace(sentence.ToString(), segmented);
    if (remainder.Length > 0)
    {
        sentenceList.Add(remainder);
    }
    return sentenceList;
}