コード例 #1
0
        /// <summary>
        /// Creates a <c>SimpleTermEntry</c> for a term and fills it from the attribute text of its tag.
        /// The attribute text is scanned left to right for the keywords SCORE, LEMMA, MSD, NORM
        /// and NORMMSD; the value of each keyword found is read with <c>GetAttrStr</c>.
        /// </summary>
        /// <returns>
        /// A new entry with <c>term</c> set to the trimmed <paramref name="termStr"/> and <c>count</c> set to 1.
        /// </returns>
        /// <param name='attrStr'>
        /// The attribute part of the tag.
        /// </param>
        /// <param name='termStr'>
        /// The text of the term; it is trimmed before being stored.
        /// </param>
        /// <param name='nfi'>
        /// Number format used to convert the SCORE value to a double.
        /// </param>
        static SimpleTermEntry ReadAttributesFromString(string attrStr, string termStr, NumberFormatInfo nfi)
        {
            SimpleTermEntry ste = new SimpleTermEntry();

            ste.term  = termStr.Trim();
            ste.count = 1;

            // "NORMMSD" is listed before "NORM": both match at the same index whenever
            // NORMMSD comes first, and the strict '<' below keeps the earlier list entry on a tie.
            string[] keywords  = { "SCORE", "LEMMA", "MSD", "NORMMSD", "NORM" };
            string   remaining = attrStr;

            while (true)
            {
                // Pick the keyword that occurs first in the text that is still unread.
                string keyword    = null;
                int    keywordIdx = -1;
                foreach (string candidate in keywords)
                {
                    int idx = remaining.IndexOf(candidate, StringComparison.Ordinal);
                    if (idx >= 0 && (keywordIdx < 0 || idx < keywordIdx))
                    {
                        keyword    = candidate;
                        keywordIdx = idx;
                    }
                }

                if (keyword == null)
                {
                    break;
                }

                // Skip the whole keyword (all 7 characters for NORMMSD as well).
                int valueStart = keywordIdx + keyword.Length;

                switch (keyword)
                {
                    case "SCORE":
                        try
                        {
                            ste.prob = Convert.ToDouble(GetAttrStr(remaining, valueStart).Trim(), nfi);
                        }
                        catch
                        {
                            // Best effort: a missing or malformed score leaves prob untouched
                            // and parsing continues with the remaining attributes.
                        }
                        break;

                    case "MSD":
                        ste.msdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(remaining, valueStart));
                        break;

                    case "LEMMA":
                        ste.lemmaSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(remaining, valueStart));
                        break;

                    case "NORM":
                        ste.normSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(remaining, valueStart));
                        break;

                    case "NORMMSD":
                        ste.normMsdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(remaining, valueStart));
                        break;
                }

                // Only the keyword is consumed; the next round searches the text after it.
                remaining = remaining.Substring(valueStart);
            }

            // lemmaSeq is only assigned above when a LEMMA attribute was found, so guard against null.
            // Underscores in the lemma are turned into spaces unless the term itself contains one.
            if (ste.lemmaSeq != null && ste.lemmaSeq.Contains("_") && !ste.term.Contains("_"))
            {
                ste.lemmaSeq = ste.lemmaSeq.Replace("_", " ");
            }

            return(ste);
        }
コード例 #2
0
        /// <summary>
        /// Parses a string and returns the terms found between <c>&lt;TENAME ...&gt;</c> and
        /// <c>&lt;/TENAME</c> tags.
        /// Terms should not be overlapping or nested!
        /// </summary>
        /// <returns>
        /// The terms in a <c>Dictionary</c>, keyed by the lower-cased term text. A term that occurs
        /// more than once is stored once, with its <c>count</c> incremented for every repetition.
        /// An empty dictionary is returned when <paramref name="text"/> is null.
        /// </returns>
        /// <param name='text'>
        /// Input text.
        /// </param>
        /// <param name='nfi'>
        /// Number format passed on to <c>ReadAttributesFromString</c> when reading the tag attributes.
        /// </param>
        /// <param name='concLen'>
        /// The length of the concordance that has to be retrieved for each term.
        /// </param>
        public static Dictionary <string, SimpleTermEntry> ParseTermsInString(string text, NumberFormatInfo nfi, int concLen)
        {
            Dictionary <string, SimpleTermEntry> res = new Dictionary <string, SimpleTermEntry>();

            if (text == null)
            {
                return(res);
            }

            const string openTag  = "<TENAME";
            const string closeTag = "</TENAME";

            string plain = GetPlaintextFromTaggedString(text);

            // Position in text from which the next pair of tags is searched.
            int pos = 0;

            while (true)
            {
                int idx    = text.IndexOf(openTag, pos, StringComparison.Ordinal);
                int endIdx = text.IndexOf(closeTag, pos, StringComparison.Ordinal);

                // Without both an opening and a closing tag ahead there is nothing left to read.
                if (idx < 0 || endIdx < 0)
                {
                    break;
                }

                // endIdx < idx means a stray closing tag comes first; it is only skipped below.
                if (endIdx > idx)
                {
                    int clIdx = text.IndexOf('>', idx);
                    if (clIdx >= 0 && clIdx < endIdx)
                    {
                        string attrStr = text.Substring(idx + openTag.Length, clIdx - idx - openTag.Length);
                        string termStr = text.Substring(clIdx + 1, endIdx - clIdx - 1);

                        SimpleTermEntry ste = ReadAttributesFromString(attrStr, termStr, nfi);
                        if (ste != null)
                        {
                            string key  = ste.term.ToLower();
                            string conc = FindConcordance(ste.term, plain, concLen);

                            SimpleTermEntry existing;
                            if (!res.TryGetValue(key, out existing))
                            {
                                ste.conc = conc;
                                res.Add(key, ste);
                            }
                            else
                            {
                                // Keep the longest concordance on the entry that is actually stored,
                                // not on the temporary entry created for this occurrence.
                                if (conc != null && (string.IsNullOrWhiteSpace(existing.conc) || conc.Length > existing.conc.Length))
                                {
                                    existing.conc = conc;
                                }
                                existing.count++;
                            }
                        }
                    }
                }

                // Continue just past the '<' of the closing tag that was found.
                pos = endIdx + 1;
            }

            return(res);
        }
コード例 #3
0
        /// <summary>
        /// Builds the <c>SimpleTermEntry</c> of one term from the attribute text of its tag.
        /// The keywords SCORE, LEMMA, MSD, NORM and NORMMSD are handled in the order in which
        /// they appear in the attribute text; each value is read with <c>GetAttrStr</c>.
        /// </summary>
        /// <param name="attrStr">The attribute part of the tag.</param>
        /// <param name="termStr">The text of the term; stored trimmed.</param>
        /// <param name="nfi">Number format used to convert the SCORE value to a double.</param>
        /// <returns>A new entry whose <c>term</c> is the trimmed term text and whose <c>count</c> is 1.</returns>
        static SimpleTermEntry ReadAttributesFromString(string attrStr, string termStr, NumberFormatInfo nfi)
        {
            SimpleTermEntry ste = new SimpleTermEntry ();
            ste.term = termStr.Trim();
            ste.count = 1;

            // The longer "NORMMSD" must precede "NORM": when NORMMSD is the next keyword both
            // are found at the same index, and the strict comparison below keeps the first one.
            string[] names = { "SCORE", "LEMMA", "MSD", "NORMMSD", "NORM" };
            string rest = attrStr;
            while (true) {
                // Find the keyword with the smallest index in the unread text.
                string found = null;
                int foundIdx = -1;
                foreach (string name in names) {
                    int at = rest.IndexOf(name, StringComparison.Ordinal);
                    if (at >= 0 && (foundIdx < 0 || at < foundIdx)) {
                        found = name;
                        foundIdx = at;
                    }
                }
                if (found == null) {
                    break;
                }

                // Start right after the keyword; for NORMMSD that is 7 characters, not 6.
                int after = foundIdx + found.Length;
                switch (found) {
                    case "SCORE":
                        try {
                            ste.prob = Convert.ToDouble(GetAttrStr(rest, after).Trim(), nfi);
                        }
                        catch {
                            // Best effort: an unreadable score keeps the current prob value
                            // and the remaining attributes are still parsed.
                        }
                        break;
                    case "MSD":
                        ste.msdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, after));
                        break;
                    case "LEMMA":
                        ste.lemmaSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, after));
                        break;
                    case "NORM":
                        ste.normSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, after));
                        break;
                    case "NORMMSD":
                        ste.normMsdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, after));
                        break;
                }

                // Drop everything up to and including the keyword before searching again.
                rest = rest.Substring(after);
            }

            // lemmaSeq is assigned only when a LEMMA attribute was seen, hence the null check.
            // A lemma written with underscores gets spaces instead, unless the term has an underscore too.
            if (ste.lemmaSeq != null && ste.lemmaSeq.Contains("_") && !ste.term.Contains("_"))
            {
                ste.lemmaSeq = ste.lemmaSeq.Replace("_"," ");
            }

            return ste;
        }