/// <summary>
/// <para>Tokenize the sense's equiv, presented as hybrid text.</para>
/// <para>During parsing, creates new word IDs as tokens come up.</para>
/// </summary>
/// <param name="txt">Hybrid text to tokenize.</param>
/// <returns>Read-only list of tokens, in order of appearance.</returns>
public ReadOnlyCollection<EquivToken> Tokenize(HybridText txt)
{
    var result = new List<EquivToken>();
    int runIndex = -1;
    foreach (TextRun run in txt.Runs)
    {
        ++runIndex;
        // A Chinese run is represented by a single zero-length token
        // carrying the word holder's dedicated "zho" ID.
        if (run is TextRunZho)
        {
            result.Add(new EquivToken
            {
                TokenId = wh.IdZho,
                RunIx = runIndex,
                StartInRun = 0,
                LengthInRun = 0,
            });
            continue;
        }
        // Plain-text runs are split into individual word tokens.
        tokenizeRun(run.GetPlainText(), runIndex, result);
    }
    return new ReadOnlyCollection<EquivToken>(result);
}
/// <summary>
/// Serialize to binary stream.
/// </summary>
/// <param name="bw">Binary writer to serialize into.</param>
/// <exception cref="Exception">
/// Thrown when a token's RunIx does not fit a byte, or its StartInRun/LengthInRun
/// does not fit a short (the on-disk narrowed representations).
/// </exception>
public void Serialize(BinWriter bw)
{
    bw.WriteInt(EntryId);
    bw.WriteInt(SenseIx);
    int equivTokenCount = EquivTokens.Count;
    bw.WriteInt(equivTokenCount);
    for (int i = 0; i != equivTokenCount; ++i)
    {
        EquivToken eqt = EquivTokens[i];
        // Validate before narrowing so out-of-range values fail loudly
        // instead of silently truncating in the casts below.
        if (eqt.RunIx < byte.MinValue || eqt.RunIx > byte.MaxValue)
        {
            // Fixed: the message previously printed eqt.StartInRun instead of the
            // offending RunIx value, which made the diagnostic misleading.
            throw new Exception("RangeIx value out of byte range: " + eqt.RunIx.ToString());
        }
        if (eqt.StartInRun < short.MinValue || eqt.StartInRun > short.MaxValue)
        {
            throw new Exception("StartInSense value out of short range: " + eqt.StartInRun.ToString());
        }
        if (eqt.LengthInRun < short.MinValue || eqt.LengthInRun > short.MaxValue)
        {
            throw new Exception("LengthInSense value out of short range: " + eqt.LengthInRun.ToString());
        }
        // Narrowed on-disk representation; widened back in the BinReader ctor.
        byte rangeIx = (byte)eqt.RunIx;
        short startInSense = (short)eqt.StartInRun;
        short lengthInSense = (short)eqt.LengthInRun;
        bw.WriteInt(eqt.TokenId);
        bw.WriteByte(rangeIx);
        bw.WriteShort(startInSense);
        bw.WriteShort(lengthInSense);
    }
}
/// <summary>
/// Ctor: read from binary stream.
/// </summary>
/// <param name="br">Binary reader positioned at a serialized instance.</param>
public TokenizedSense(BinReader br)
{
    EntryId = br.ReadInt();
    SenseIx = br.ReadInt();
    // Token count precedes the token records in the stream.
    int count = br.ReadInt();
    EquivTokens = new List<EquivToken>(count);
    while (EquivTokens.Count < count)
    {
        // Widen the narrowed on-disk byte/short fields back to int.
        var eqt = new EquivToken
        {
            TokenId = br.ReadInt(),
            RunIx = br.ReadByte(),
            StartInRun = br.ReadShort(),
            LengthInRun = br.ReadShort(),
        };
        EquivTokens.Add(eqt);
    }
}
/// <summary>
/// Tokenizes a single plain text run.
/// </summary>
/// <param name="str">Text of the plain text run.</param>
/// <param name="runIx">Current run index (to be stored in result).</param>
/// <param name="tokens">List of tokens to append to.</param>
private void tokenizeRun(string str, int runIx, List<EquivToken> tokens)
{
    // Per-character token map:
    // -1 means the character belongs to no token (separator, trimmed punctuation);
    // any other value is the index of the token the character belongs to.
    int[] flags = new int[str.Length];
    // Start out assuming the entire run is one token (index 0). Array elements
    // default to 0, but the explicit fill keeps the intent visible.
    for (int k = 0; k < flags.Length; ++k) flags[k] = 0;

    // Mark leading separators (space/dash) as non-token.
    int ix = 0;
    while (ix < flags.Length && (str[ix] == ' ' || str[ix] == '-'))
    {
        flags[ix] = -1;
        ++ix;
    }
    // Every later separator splits the remainder into a new token:
    // mark it as non-token and bump the token index of all following characters.
    for (; ix < flags.Length; ++ix)
    {
        if (str[ix] == ' ' || str[ix] == '-')
        {
            flags[ix] = -1;
            for (int k = ix + 1; k < flags.Length; ++k) ++flags[k];
        }
    }

    // First pass over the segments: trim punctuation off both ends of each token
    // (trimPunct flips trimmed characters to -1 in the flags array).
    ix = 0;
    while (ix < flags.Length)
    {
        if (flags[ix] == -1) { ++ix; continue; }
        // A token is a maximal stretch of characters sharing the same index.
        int start = ix;
        int length = 0;
        while (ix < flags.Length && flags[ix] == flags[start])
        {
            ++length;
            ++ix;
        }
        trimPunct(str, flags, start, length);
    }

    // Second pass: emit an EquivToken for every surviving token.
    ix = 0;
    while (ix < flags.Length)
    {
        if (flags[ix] == -1) { ++ix; continue; }
        int start = ix;
        int length = 0;
        while (ix < flags.Length && flags[ix] == flags[start])
        {
            ++length;
            ++ix;
        }
        // Token text: numeric tokens collapse into the shared *num* token,
        // everything else is stored lower-cased.
        string strToken = str.Substring(start, length);
        strToken = reNumbers.IsMatch(strToken)
            ? WordHolder.TokenNum
            : strToken.ToLowerInvariant();
        tokens.Add(new EquivToken
        {
            TokenId = wh.GetTokenId(strToken),
            RunIx = runIx,
            StartInRun = start,
            LengthInRun = length,
        });
    }
}
/// <summary>
/// Ctor: read from binary stream.
/// </summary>
/// <param name="br">Binary reader positioned at a serialized instance.</param>
// NOTE(review): this constructor is an exact duplicate of the TokenizedSense(BinReader)
// ctor that appears earlier in this file. Two identical constructor signatures will not
// compile in the same class — one of the two copies should be deleted.
public TokenizedSense(BinReader br)
{
    EntryId = br.ReadInt();
    SenseIx = br.ReadInt();
    // Token count precedes the token records (mirrors Serialize's layout).
    int equivTokenCount = br.ReadInt();
    EquivTokens = new List<EquivToken>(equivTokenCount);
    for (int i = 0; i != equivTokenCount; ++i)
    {
        EquivToken eqt = new EquivToken();
        eqt.TokenId = br.ReadInt();
        // Widen the narrowed on-disk byte/short representations back to int.
        eqt.RunIx = (int)br.ReadByte();
        eqt.StartInRun = (int)br.ReadShort();
        eqt.LengthInRun = (int)br.ReadShort();
        EquivTokens.Add(eqt);
    }
}