/// <summary>
/// Builds a <c>SimpleTermEntry</c> from the attribute text of a term tag and the text the tag encloses.
/// </summary>
/// <remarks>
/// The attribute text is scanned left to right for the keywords <c>SCORE</c>, <c>LEMMA</c>,
/// <c>MSD</c>, <c>NORM</c> and <c>NORMMSD</c>. For each keyword found, the value is obtained
/// with <c>GetAttrStr</c>, called with the index just past the keyword. <c>SCORE</c> is converted
/// to a double; the other values are HTML-decoded and stored as-is.
/// </remarks>
/// <returns>
/// A new entry whose <c>term</c> is the trimmed <paramref name="termStr"/> and whose <c>count</c> is 1.
/// </returns>
/// <param name='attrStr'>
/// The text between the tag name and the closing <c>&gt;</c> of the opening tag.
/// </param>
/// <param name='termStr'>
/// The text enclosed by the tag.
/// </param>
/// <param name='nfi'>
/// Number format used to convert the <c>SCORE</c> value.
/// </param>
static SimpleTermEntry ReadAttributesFromString(string attrStr, string termStr, NumberFormatInfo nfi)
{
    SimpleTermEntry ste = new SimpleTermEntry();
    ste.term = termStr.Trim();
    ste.count = 1;

    // NORMMSD is not listed here: it begins with NORM, so it is always found at the
    // same position as NORM and is told apart once NORM turns out to be the earliest match.
    string[] keywords = { "SCORE", "LEMMA", "MSD", "NORM" };
    const string NormMsdKeyword = "NORMMSD";

    string tempStr = attrStr;
    while (true)
    {
        // Pick the keyword that occurs first in the remaining text. Ordinal search keeps
        // the match exactly as long as the keyword, which the offsets below rely on.
        int keywordIdx = -1;
        string keyword = null;
        foreach (string candidate in keywords)
        {
            int candidateIdx = tempStr.IndexOf(candidate, StringComparison.Ordinal);
            if (candidateIdx >= 0 && (keywordIdx < 0 || candidateIdx < keywordIdx))
            {
                keywordIdx = candidateIdx;
                keyword = candidate;
            }
        }

        if (keyword == null)
        {
            break;
        }

        if (keyword == "NORM" && tempStr.IndexOf(NormMsdKeyword, StringComparison.Ordinal) == keywordIdx)
        {
            keyword = NormMsdKeyword;
        }

        // NOTE(review): NORMMSD used to be skipped by 6 characters, one short of its length;
        // the full keyword length is used for every attribute now. GetAttrStr is defined
        // elsewhere - confirm it accepts an index that points just past the keyword.
        int valueIdx = keywordIdx + keyword.Length;

        switch (keyword)
        {
            case "SCORE":
                try
                {
                    ste.prob = Convert.ToDouble(GetAttrStr(tempStr, valueIdx).Trim(), nfi);
                }
                catch
                {
                    // Best effort: a missing or malformed score leaves prob untouched
                    // and parsing continues with the remaining attributes.
                }
                break;
            case "LEMMA":
                ste.lemmaSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(tempStr, valueIdx));
                break;
            case "MSD":
                ste.msdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(tempStr, valueIdx));
                break;
            case "NORM":
                ste.normSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(tempStr, valueIdx));
                break;
            case NormMsdKeyword:
                ste.normMsdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(tempStr, valueIdx));
                break;
        }

        // Continue after the keyword just handled so it is not matched again.
        tempStr = tempStr.Substring(valueIdx);
    }

    // Underscores in the lemma are turned into spaces unless the term itself contains one.
    // lemmaSeq is only assigned when a LEMMA attribute was present, hence the null check.
    if (ste.lemmaSeq != null && ste.lemmaSeq.Contains("_") && !ste.term.Contains("_"))
    {
        ste.lemmaSeq = ste.lemmaSeq.Replace("_", " ");
    }

    return ste;
}
/// <summary>
/// Parses a string and returns the terms found in it.
/// Terms should not be overlapping or nested!
/// </summary>
/// <remarks>
/// A term is the text between an opening <c>&lt;TENAME ...&gt;</c> tag and the next
/// <c>&lt;/TENAME</c>. A closing tag that appears before any opening tag is skipped.
/// </remarks>
/// <returns>
/// The terms in a <c>Dictionary</c>, keyed by the lower-cased term text. A term that occurs
/// more than once keeps the entry created for its first occurrence; its <c>count</c> is
/// incremented for every further occurrence. The dictionary is empty when
/// <paramref name="text"/> is <c>null</c> or contains no terms.
/// </returns>
/// <param name='text'>
/// Input text.
/// </param>
/// <param name='nfi'>
/// Number format handed to <c>ReadAttributesFromString</c> for converting attribute values.
/// </param>
/// <param name='concLen'>
/// The length of the concordance that has to be retrieved for each term.
/// </param>
public static Dictionary<string, SimpleTermEntry> ParseTermsInString(string text, NumberFormatInfo nfi, int concLen)
{
    Dictionary<string, SimpleTermEntry> res = new Dictionary<string, SimpleTermEntry>();
    if (text == null)
    {
        return res;
    }

    const string OpenTag = "<TENAME";
    const string CloseTag = "</TENAME";

    string plain = GetPlaintextFromTaggedString(text);

    // Cursor into text; everything before it has already been processed.
    int pos = 0;
    while (true)
    {
        int idx = text.IndexOf(OpenTag, pos, StringComparison.Ordinal);
        int endIdx = text.IndexOf(CloseTag, pos, StringComparison.Ordinal);

        if (idx >= 0 && endIdx > idx)
        {
            // The opening tag must be closed by '>' before the end tag starts.
            int clIdx = text.IndexOf('>', idx);
            if (clIdx >= 0 && clIdx < endIdx)
            {
                int attrStart = idx + OpenTag.Length;
                string attrStr = text.Substring(attrStart, clIdx - attrStart);
                string termStr = text.Substring(clIdx + 1, endIdx - clIdx - 1);

                SimpleTermEntry ste = ReadAttributesFromString(attrStr, termStr, nfi);
                if (ste != null)
                {
                    string key = ste.term.ToLower();
                    string conc = FindConcordance(ste.term, plain, concLen);

                    SimpleTermEntry existing;
                    if (!res.TryGetValue(key, out existing))
                    {
                        ste.conc = conc;
                        res.Add(key, ste);
                    }
                    else
                    {
                        // Keep the longest concordance on the entry that is actually stored,
                        // not on the freshly parsed one, which is discarded.
                        if (conc != null
                            && (string.IsNullOrWhiteSpace(existing.conc) || conc.Length > existing.conc.Length))
                        {
                            existing.conc = conc;
                        }
                        existing.count++;
                    }
                }
            }

            // Resume just inside the end tag so it is not found again.
            pos = endIdx + 1;
        }
        else if (endIdx >= 0 && endIdx < idx)
        {
            // Closing tag with no opening tag before it: skip it.
            pos = endIdx + 1;
        }
        else
        {
            break;
        }
    }

    return res;
}
/// <summary>
/// Reads the attributes of a term tag into a new <c>SimpleTermEntry</c>.
/// </summary>
/// <remarks>
/// Recognised attribute keywords are <c>SCORE</c>, <c>LEMMA</c>, <c>MSD</c>, <c>NORM</c> and
/// <c>NORMMSD</c>. They are handled in the order in which they occur in
/// <paramref name="attrStr"/>; each value is fetched with <c>GetAttrStr</c>. The score is
/// converted to a double, all other values are HTML-decoded.
/// </remarks>
/// <returns>
/// The populated entry; <c>term</c> holds the trimmed <paramref name="termStr"/> and
/// <c>count</c> is set to 1.
/// </returns>
/// <param name='attrStr'>
/// Attribute text of the opening tag.
/// </param>
/// <param name='termStr'>
/// Text enclosed by the tag.
/// </param>
/// <param name='nfi'>
/// Number format used when converting the <c>SCORE</c> value.
/// </param>
static SimpleTermEntry ReadAttributesFromString(string attrStr, string termStr, NumberFormatInfo nfi)
{
    const string Score = "SCORE";
    const string Lemma = "LEMMA";
    const string Msd = "MSD";
    const string Norm = "NORM";
    const string NormMsd = "NORMMSD";

    SimpleTermEntry ste = new SimpleTermEntry();
    ste.term = termStr.Trim();
    ste.count = 1;

    string rest = attrStr;
    while (true)
    {
        // Ordinal search: the offsets added below assume a match is exactly as long
        // as the keyword, which a culture-sensitive search does not guarantee.
        int sIdx = rest.IndexOf(Score, StringComparison.Ordinal);
        int lIdx = rest.IndexOf(Lemma, StringComparison.Ordinal);
        int mIdx = rest.IndexOf(Msd, StringComparison.Ordinal);
        int nIdx = rest.IndexOf(Norm, StringComparison.Ordinal);
        int nmIdx = rest.IndexOf(NormMsd, StringComparison.Ordinal);

        // Earliest keyword position. nmIdx is left out because NORMMSD starts with NORM,
        // so nIdx is never greater than nmIdx.
        int first = -1;
        foreach (int idx in new int[] { sIdx, lIdx, mIdx, nIdx })
        {
            if (idx >= 0 && (first < 0 || idx < first))
            {
                first = idx;
            }
        }

        if (first < 0)
        {
            break;
        }

        int next;
        if (first == nmIdx)
        {
            // Tested before NORM: both indices coincide when the keyword is NORMMSD.
            // NOTE(review): this branch used to advance by 6, one short of the keyword
            // length; it now skips the whole keyword like the other branches. GetAttrStr
            // is defined elsewhere - confirm it accepts an index just past the keyword.
            next = first + NormMsd.Length;
            ste.normMsdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, next));
        }
        else if (first == nIdx)
        {
            next = first + Norm.Length;
            ste.normSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, next));
        }
        else if (first == sIdx)
        {
            next = first + Score.Length;
            try
            {
                ste.prob = Convert.ToDouble(GetAttrStr(rest, next).Trim(), nfi);
            }
            catch
            {
                // Best effort: an unreadable score is ignored and prob keeps its value.
            }
        }
        else if (first == lIdx)
        {
            next = first + Lemma.Length;
            ste.lemmaSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, next));
        }
        else
        {
            next = first + Msd.Length;
            ste.msdSeq = System.Net.WebUtility.HtmlDecode(GetAttrStr(rest, next));
        }

        // Drop everything up to and including the keyword just processed.
        rest = rest.Substring(next);
    }

    // A lemma written with underscores is converted to spaces when the term has none.
    // lemmaSeq stays unassigned without a LEMMA attribute, so guard against null.
    if (ste.lemmaSeq != null && ste.lemmaSeq.Contains("_") && !ste.term.Contains("_"))
    {
        ste.lemmaSeq = ste.lemmaSeq.Replace("_", " ");
    }

    return ste;
}