Exemple #1
0
        /*
         * public ChineseDocumentToSentenceProcessor(String normalizationTableFile, String encoding) {
         * log.info("WARNING: ChineseDocumentToSentenceProcessor ignores normalizationTableFile argument!");
         * log.info("WARNING: ChineseDocumentToSentenceProcessor ignores encoding argument!");
         * // encoding is never read locally
         * this.encoding = encoding;
         * }
         */
        /// <summary>
        /// Normalizes a string in two passes: first through <c>ChineseUtils.Normalize</c>,
        /// then through the single-argument <c>Normalize</c> visible in this scope.
        /// </summary>
        /// <remarks>
        /// This method is slated to become disused; callers are expected to use
        /// ChineseUtils directly instead (CDM, June 2006).
        /// </remarks>
        /// <param name="in">Text to normalize.</param>
        /// <returns>The result of the second normalization pass.</returns>
        public virtual string Normalization(string @in)
        {
            // The inner call runs first; its output feeds the outer Normalize.
            return Normalize(ChineseUtils.Normalize(@in));
        }
Exemple #2
0
        /// <summary>
        /// Runs <c>ChineseUtils.Normalize</c> on one mixed English/Chinese/emoji string under
        /// three argument combinations and compares each result with a fixed expected string.
        /// </summary>
        public virtual void TestNormalize()
        {
            string input = "Hello  English - 你好\u3000班汉·西巴阿差\u3000Chris•Manning \uD83E\uDD16\uD83E\uDD16robot";

            // All three mode arguments set to Leave: the expected text equals the input.
            string expectedLeave = "Hello  English - 你好\u3000班汉·西巴阿差\u3000Chris•Manning \uD83E\uDD16\uD83E\uDD16robot";
            string actualLeave   = ChineseUtils.Normalize(input, ChineseUtils.Leave, ChineseUtils.Leave, ChineseUtils.Leave);
            NUnit.Framework.Assert.AreEqual(expectedLeave, actualLeave);

            // Ascii, Ascii, Normalize.
            string expectedAscii = "Hello  English - 你好 班汉·西巴阿差 Chris·Manning \uD83E\uDD16\uD83E\uDD16robot";
            string actualAscii   = ChineseUtils.Normalize(input, ChineseUtils.Ascii, ChineseUtils.Ascii, ChineseUtils.Normalize);
            NUnit.Framework.Assert.AreEqual(expectedAscii, actualAscii);

            // All three mode arguments set to Fullwidth.
            string expectedFullwidth = "Hello\u3000\u3000English\u3000-\u3000你好 班汉・西巴阿差 Chris・Manning\u3000\uD83E\uDD16\uD83E\uDD16robot";
            string actualFullwidth   = ChineseUtils.Normalize(input, ChineseUtils.Fullwidth, ChineseUtils.Fullwidth, ChineseUtils.Fullwidth);
            NUnit.Framework.Assert.AreEqual(expectedFullwidth, actualFullwidth);
        }
        /// <summary>
        /// Loads a dictionary file, one entry per line, into the collection created by
        /// <c>Generics.NewHashSet</c>.
        /// </summary>
        /// <remarks>
        /// Each line is trimmed. A line that had surrounding whitespace, or that is empty after
        /// trimming, is reported on <c>EncodingPrintWriter.Err</c>; empty lines are not added.
        /// An <c>IOException</c> raised while reading is rethrown wrapped in a <c>RuntimeIOException</c>.
        /// </remarks>
        /// <param name="filename">Location handed to <c>IOUtils.GetInputStreamFromURLOrClasspathOrFileSystem</c>.</param>
        /// <param name="normalize">When true, each entry is passed through <c>ChineseUtils.Normalize</c> before being stored.</param>
        /// <returns>The collection of entries read.</returns>
        private static ICollection <string> ReadDict(string filename, bool normalize)
        {
            ICollection <string> entries = Generics.NewHashSet();
            string mode = normalize ? "normalized" : "unnormalized";

            logger.Info("Loading " + mode + " dictionary from " + filename);
            try
            {
                using (InputStream @is = IOUtils.GetInputStreamFromURLOrClasspathOrFileSystem(filename))
                {
                    BufferedReader reader = new BufferedReader(new InputStreamReader(@is, "UTF-8"));
                    int            lineNo = 0;
                    string         rawLine;
                    while ((rawLine = reader.ReadLine()) != null)
                    {
                        lineNo++;
                        string entry = rawLine.Trim();
                        // A length change after Trim() means the line carried surrounding whitespace.
                        if (entry.Length != rawLine.Length)
                        {
                            EncodingPrintWriter.Err.Println("Line " + lineNo + " of " + filename + " has leading/trailing whitespace: |" + entry + "|", "UTF-8");
                        }
                        if (entry.Length == 0)
                        {
                            EncodingPrintWriter.Err.Println("Line " + lineNo + " of " + filename + " is empty", "UTF-8");
                            continue;
                        }
                        if (normalize)
                        {
                            entry = ChineseUtils.Normalize(entry, ChineseUtils.Ascii, ChineseUtils.Ascii, ChineseUtils.Normalize);
                        }
                        entries.Add(entry);
                    }
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            return entries;
        }
        /// <summary>
        /// Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
        /// of binary predictions.
        /// </summary>
        /// <remarks>
        /// A limited amount of post-processing is applied while walking the graph: whitespace of the
        /// input is preserved (when <c>flags.keepAllWhitespaces</c> is set), and no boundary is
        /// hypothesized between two ASCII letters or between two numeral characters unless the input
        /// already had one. Consequently, the probabilities of all paths in answerLattice
        /// may not sum to 1 (they do sum to 1 if no post-processing applies).
        /// A branch is only followed further while its cost stays below <c>flags.searchGraphPrune</c>
        /// or the current character is an ASCII letter.
        /// </remarks>
        /// <param name="tSource">Current node in Viterbi search graph.</param>
        /// <param name="aSource">Current node in answer lattice.</param>
        /// <param name="answer">Partial word starting at aSource.</param>
        /// <param name="nodeId">
        /// Counter that is incremented for every newly created answer-lattice state and passed to
        /// that state's constructor.
        /// </param>
        /// <param name="pos">Current position in docArray.</param>
        /// <param name="cost">Current cost of answer.</param>
        /// <param name="stateLinks">
        /// Maps nodes of the search graph to nodes in answer lattice
        /// (when paths of the search graph are recombined, paths of the answer lattice should be
        /// recombined as well, if at word boundary).
        /// </param>
        /// <param name="answerLattice">Lattice handed to each newly created answer state.</param>
        /// <param name="docArray">
        /// Per-character labels; the original character and the space-before marker at
        /// <paramref name="pos"/> are read from it.
        /// </param>
        private void TagLatticeToAnswerLattice(DFSAState <string, int> tSource, DFSAState <string, int> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, IDictionary <DFSAState <string, int>, DFSAState <string, int> > stateLinks,
                                               DFSA <string, int> answerLattice, CoreLabel[] docArray)
        {
            // The log messages below use .NET composite-format placeholders; the trailing line break
            // that the messages have always carried is appended explicitly.
            string eol = System.Environment.NewLine;

            // Add "1" prediction after the end of the sentence, if applicable:
            if (tSource.IsAccepting() && tSource.ContinuingInputs().IsEmpty())
            {
                tSource.AddTransition(new DFSATransition <string, int>(string.Empty, tSource, new DFSAState <string, int>(-1, null), "1", string.Empty, 0));
            }
            // Get current label, character, and prediction:
            CoreLabel curLabel  = (pos < docArray.Length) ? docArray[pos] : null;
            string    curChr    = null;
            string    origSpace = null;

            if (curLabel != null)
            {
                curChr = curLabel.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
                System.Diagnostics.Debug.Assert((curChr.Length == 1));
                origSpace = curLabel.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation));
            }
            // Get set of successors in search graph:
            ICollection <string> inputs = tSource.ContinuingInputs();
            // Only keep most probable (lowest-score) transition out of initial state:
            string answerConstraint = null;

            if (pos == 0)
            {
                double minCost = double.PositiveInfinity;
                foreach (string candidate in inputs)
                {
                    DFSATransition <string, int> transition = tSource.Transition(candidate);
                    double transitionCost = transition.Score();
                    if (transitionCost < minCost)
                    {
                        if (candidate != null)
                        {
                            logger.Info(string.Format("mincost ({0}): {1:e} -> {2:e}", candidate, minCost, transitionCost) + eol);
                            minCost          = transitionCost;
                            answerConstraint = candidate;
                        }
                    }
                }
            }
            // Follow along each transition:
            foreach (string predictSpace in inputs)
            {
                DFSATransition <string, int> transition = tSource.Transition(predictSpace);
                DFSAState <string, int>      tDest      = transition.Target();
                DFSAState <string, int>      newASource = aSource;
                // Work on a private copy so sibling branches do not see each other's characters.
                StringBuilder newAnswer = new StringBuilder(answer.ToString());
                int           answerLen = newAnswer.Length;
                // Last character of the partial word, or null when the partial word is empty.
                string        prevChr   = (answerLen > 0) ? newAnswer.ToString(answerLen - 1, 1) : null;
                double        newCost   = cost;
                // At pos 0, ignore every transition other than the best one selected above
                // (answerConstraint is only ever set when pos == 0):
                if (answerConstraint != null && !answerConstraint.Equals(predictSpace))
                {
                    logger.Info(string.Format("Skipping transition {0} at pos 0.", predictSpace) + eol);
                    continue;
                }
                // Ignore paths not consistent with input segmentation:
                if (flags.keepAllWhitespaces && "0".Equals(predictSpace) && "1".Equals(origSpace))
                {
                    logger.Info(string.Format("Skipping non-boundary at pos {0}, since space in the input.", pos) + eol);
                    continue;
                }
                // Ignore paths adding segment boundaries between two latin characters, or between two digits:
                // (unless already present in original input)
                if ("1".Equals(predictSpace) && "0".Equals(origSpace) && prevChr != null && curChr != null)
                {
                    char p = prevChr[0];
                    char c = curChr[0];
                    if (ChineseStringUtils.IsLetterASCII(p) && ChineseStringUtils.IsLetterASCII(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two ASCII letters ({1} and {2}).", pos, prevChr, curChr) + eol);
                        continue;
                    }
                    if (ChineseUtils.IsNumber(p) && ChineseUtils.IsNumber(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two numeral characters ({1} and {2}).", pos, prevChr, curChr) + eol);
                        continue;
                    }
                }
                // If predictSpace==1 and there is a pending word, close it with a transition in the answer graph:
                if ("1".Equals(predictSpace) && newAnswer.Length > 0)
                {
                    // If answer destination node visited before, create a new edge and leave:
                    DFSAState <string, int> aDest;
                    if (stateLinks.TryGetValue(tSource, out aDest))
                    {
                        newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest, newAnswer.ToString(), string.Empty, newCost));
                        continue;
                    }
                    // If answer destination node not visited before, create it + new edge:
                    nodeId.IncValue(1);
                    aDest = new DFSAState <string, int>(nodeId, answerLattice, 0.0);
                    stateLinks[tSource] = aDest;
                    newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest, newAnswer.ToString(), string.Empty, newCost));
                    // Reached an accepting state:
                    if (tSource.IsAccepting())
                    {
                        aDest.SetAccepting(true);
                        continue;
                    }
                    // Start new answer edge:
                    newASource = aDest;
                    newAnswer  = new StringBuilder();
                    newCost    = 0.0;
                }
                System.Diagnostics.Debug.Assert((curChr != null));
                newAnswer.Append(curChr);
                newCost += transition.Score();
                // Prune expensive branches, but never cut in the middle of a run of ASCII letters.
                if (newCost < flags.searchGraphPrune || ChineseStringUtils.IsLetterASCII(curChr[0]))
                {
                    TagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
                }
            }
        }
Exemple #5
0
        /// <summary>
        /// Rebuilds the output string for a sequence of per-character labels, inserting a separator
        /// in front of each character where the labels and <paramref name="flags"/> call for a
        /// word boundary.
        /// </summary>
        /// <remarks>
        /// A separator that coincides with whitespace in the input (SpaceBeforeAnnotation equal to "1")
        /// is first written as the placeholder character U+1924; any other separator is a plain space.
        /// When <c>flags.sighanPostProcessing</c> is set the string is passed through
        /// <c>PostProcessingAnswer</c>; the placeholders are turned into spaces before that call
        /// unless <c>flags.keepAllWhitespaces</c> is set. Every remaining placeholder is replaced by a
        /// space before returning.
        /// </remarks>
        /// <param name="doc">Character labels in order; each supplies the answer, position, space-before and original-character annotations.</param>
        /// <param name="flags">Options read here: keepEnglishWhitespaces, keepAllWhitespaces, separateASCIIandRange, sighanPostProcessing.</param>
        /// <returns>The recombined sentence text.</returns>
        public static string CombineSegmentedSentence(IList <CoreLabel> doc, SeqClassifierFlags flags)
        {
            // History:
            //  - The whitespace normalization was rewritten because the earlier version was
            //    hackish and broken for some test cases.  -cer (6/14/2006)
            //  - "testContent" is no longer used: the whole test file was being re-read and
            //    testContentIdx reset to 0 each time, while "doc" moved line by line.
            //    Only the running index survives.  -pichuan (Oct 7 2007)
            int           testContentIdx = 0;
            // the actual output we will return
            StringBuilder ans            = new StringBuilder();
            CoreLabel     wi             = null;

            foreach (CoreLabel label in doc)
            {
                CoreLabel pwi = wi;
                wi = label;
                bool originalWhiteSpace = "1".Equals(wi.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation)));
                bool seg;
                //  if the CRF says "START" (segmented), and it's not the first word..
                if (wi.Get(typeof(CoreAnnotations.AnswerAnnotation)).Equals("1") && !("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())))
                {
                    // since it's in the "1" condition.. default is to seg
                    seg = true;
                    // check if we need to preserve the "no space" between English characters
                    if (flags.keepEnglishWhitespaces && testContentIdx > 0)
                    {
                        char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        // keep the "non space" before wi
                        if (IsLetterASCII(prevChar) && IsLetterASCII(currChar) && !originalWhiteSpace)
                        {
                            seg = false;
                        }
                    }
                    // if there was space and keepAllWhitespaces is true, restore it no matter what
                    if (flags.keepAllWhitespaces && originalWhiteSpace)
                    {
                        seg = true;
                    }
                }
                else
                {
                    // since it's in the "0" condition.. default is not to seg
                    seg = false;
                    if (testContentIdx > 0)
                    {
                        char prevChar = pwi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        // Words consisting of English/ASCII characters are separated from the
                        // surrounding Chinese characters (decided after conversation with Huihsin; -cer,
                        // reworked by pichuan Oct 2007), except between two number characters.
                        // cdm: the exception is hit for an ASCII number next to a Unihan range number.
                        if ((prevChar < (char)128) != (currChar < (char)128))
                        {
                            if (!(ChineseUtils.IsNumber(prevChar) && ChineseUtils.IsNumber(currChar)) && flags.separateASCIIandRange)
                            {
                                seg = true;
                            }
                        }
                        // keep the "space" before wi between letter/letter, letter/number and number/letter
                        if (flags.keepEnglishWhitespaces
                            && (IsLetterASCII(prevChar) && IsLetterASCII(currChar) || IsLetterASCII(prevChar) && ChineseUtils.IsNumber(currChar) || ChineseUtils.IsNumber(prevChar) && IsLetterASCII(currChar))
                            && originalWhiteSpace)
                        {
                            seg = true;
                        }
                    }
                    // if there was space and keepAllWhitespaces is true, restore it no matter what
                    if (flags.keepAllWhitespaces)
                    {
                        if (!("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())) && originalWhiteSpace)
                        {
                            seg = true;
                        }
                    }
                }
                if (seg)
                {
                    // U+1924 is a pretty Limbu character which is later changed to a space; it marks
                    // separators that were already whitespace in the input.
                    ans.Append(originalWhiteSpace ? '\u1924' : ' ');
                }
                ans.Append(wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation)));
                testContentIdx++;
            }
            string ansStr = ans.ToString();

            if (flags.sighanPostProcessing)
            {
                if (!flags.keepAllWhitespaces)
                {
                    // remove the Limbu char now, so it can be deleted in postprocessing
                    ansStr = ansStr.Replace('\u1924', ' ');
                }
                ansStr = PostProcessingAnswer(ansStr, flags);
            }
            // definitely remove the Limbu char if it survived till now
            return ansStr.Replace('\u1924', ' ');
        }
Exemple #6
0
        /// <summary>
        /// Splits plain text into sentences at full-stop characters.
        /// </summary>
        /// <remarks>
        /// The text is first normalized with <c>ChineseUtils.Normalize</c> (first mode argument
        /// <c>Leave</c> when <paramref name="segmented"/> is true, <c>Fullwidth</c> otherwise).
        /// A character from <c>fullStopsSet</c> ends the current sentence; in segmented mode it only
        /// does so when it is the first character or directly follows a space-separator character,
        /// i.e. when it stands alone (cf. URLs). After a sentence end, characters from
        /// <c>rightMarkSet</c>, whitespace and further full stops still belong to that sentence;
        /// the first other character starts the next one. Each sentence is passed through
        /// <c>RemoveWhitespace</c> and kept only if it is non-empty afterwards.
        /// </remarks>
        /// <param name="contentString">Text to split.</param>
        /// <param name="segmented">Whether the input is treated as already segmented; selects the normalization mode and the stricter full-stop rule, and is forwarded to <c>RemoveWhitespace</c>.</param>
        /// <returns>The sentences in input order.</returns>
        /// <exception cref="System.IO.IOException"/>
        public static IList <string> FromPlainText(string contentString, bool segmented)
        {
            if (segmented)
            {
                contentString = ChineseUtils.Normalize(contentString, ChineseUtils.Leave, ChineseUtils.Ascii);
            }
            else
            {
                contentString = ChineseUtils.Normalize(contentString, ChineseUtils.Fullwidth, ChineseUtils.Ascii);
            }
            string sentenceString = string.Empty;

            char[]         content      = contentString.ToCharArray();
            bool           sentenceEnd  = false;
            IList <string> sentenceList = new List <string>();
            // Previous character, or -1 before the first one.
            int            lastCh       = -1;

            foreach (char c in content)
            {
                string newChar = c.ToString();
                if (!sentenceEnd)
                {
                    // In segmented mode require a standalone punctuation mark -- cf. URLs.
                    // char.IsSeparator covers the Unicode space/line/paragraph separator categories.
                    if (fullStopsSet.Contains(c) && (!segmented || lastCh == -1 || char.IsSeparator((char)lastCh)))
                    {
                        sentenceEnd = true;
                    }
                    sentenceString += newChar;
                }
                else if (rightMarkSet.Contains(c) || newChar.Matches("\\s") || fullStopsSet.Contains(c))
                {
                    // Closing marks, whitespace and repeated full stops stay with the finished sentence.
                    sentenceString += newChar;
                }
                else
                {
                    // Any other character: emit the finished sentence and start a new one with c.
                    if (sentenceString.Length > 0)
                    {
                        sentenceEnd = false;
                    }
                    sentenceString = RemoveWhitespace(sentenceString, segmented);
                    if (sentenceString.Length > 0)
                    {
                        sentenceList.Add(sentenceString);
                    }
                    sentenceString = newChar;
                }
                lastCh = c;
            }
            // Flush whatever is left after the last character.
            sentenceString = RemoveWhitespace(sentenceString, segmented);
            if (sentenceString.Length > 0)
            {
                sentenceList.Add(sentenceString);
            }
            return sentenceList;
        }