/// <summary>
/// <para>Keeps one tokenized sense in memory and adds it to the sense index under each distinct token ID it contains.</para>
/// <para>A sense whose tokens all have the ID <c>IdZho</c> or <c>IdNum</c> (Chinese and number tokens) is neither stored nor indexed.</para>
/// </summary>
/// <param name="tokens">The tokens of the sense.</param>
/// <param name="entryId">Entry ID stored with the tokenized sense.</param>
/// <param name="senseIx">Sense index stored with the tokenized sense.</param>
/// <exception cref="Exception">The sense has more distinct token IDs than fit in a byte.</exception>
private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
{
    // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
    bool relevant = false;
    foreach (EquivToken eqt in tokens)
    {
        if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
        {
            relevant = true;
            break;
        }
    }
    if (!relevant)
    {
        return;
    }

    // Get the set of different token IDs - we don't index dupes
    HashSet<int> tokenIdSet = new HashSet<int>();
    foreach (EquivToken eqt in tokens)
    {
        tokenIdSet.Add(eqt.TokenId);
    }

    // Validate BEFORE touching any shared state: the distinct-token count is stored as a byte.
    // Checking up front means a failure leaves both the sense list and the index untouched.
    if (tokenIdSet.Count > byte.MaxValue)
    {
        throw new Exception("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());
    }
    byte tokensInSense = (byte)tokenIdSet.Count;

    // Keep tokenized sense in memory; its ID is its position in the list
    int senseId = tsenses.Count;
    TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
    tsenses.Add(ts);

    // Now, add the sense to the instance list of each distinct token ID
    foreach (int tokenId in tokenIdSet)
    {
        // Single lookup; create the index item the first time a token ID is seen
        SenseIndexItem sii;
        if (!index.SenseIndex.TryGetValue(tokenId, out sii))
        {
            sii = new SenseIndexItem();
            index.SenseIndex[tokenId] = sii;
        }
        SenseInfo senseInfo = new SenseInfo
        {
            TokenizedSenseId = senseId,
            TokensInSense = tokensInSense,
        };
        sii.Instances.Add(senseInfo);
    }
}
/// <summary>
/// <para>Looks for query text in tokenized sense, returns corresponding target highlight if found.</para>
/// <para>If not found (sense doesn't contain query as a sequence), or if the query has no tokens, returns null.</para>
/// </summary>
/// <param name="txtTokenized">The tokenized query text.</param>
/// <param name="ts">The tokenized sense to search in.</param>
/// <returns>Highlight for the first position where the query's token IDs occur consecutively in the sense; null if there is none.</returns>
/// <exception cref="Exception">A multi-token match starts and ends in different text runs.</exception>
private CedictTargetHighlight doFindTargetQuery(ReadOnlyCollection<EquivToken> txtTokenized, TokenizedSense ts)
{
    int queryLength = txtTokenized.Count;

    // An empty query cannot match anything. Without this guard the scan below would treat
    // position 0 as a zero-length match and then index the sense's tokens at -1.
    if (queryLength == 0)
    {
        return null;
    }

    // Last start position at which the whole query still fits into the sense
    int lastStart = ts.EquivTokens.Count - queryLength;
    for (int i = 0; i <= lastStart; ++i)
    {
        // Count how many query tokens match consecutively from position i
        int j = 0;
        while (j != queryLength && txtTokenized[j].TokenId == ts.EquivTokens[i + j].TokenId)
        {
            ++j;
        }
        if (j != queryLength)
        {
            continue;
        }

        // We found full query text: create highlight now
        var first = ts.EquivTokens[i];

        // Query is a single token: highlight is exactly that token
        if (queryLength == 1)
        {
            return new CedictTargetHighlight(ts.SenseIx, first.RunIx, first.StartInRun, first.LengthInRun);
        }

        // Query is multiple tokens
        var last = ts.EquivTokens[i + j - 1];
        // Sanity check: all tokens in tokenized sense must be from same text run
        // We don't even index across multiple runs
        // And definitely don't look up queries that have Hanzi in the middle
        if (first.RunIx != last.RunIx)
        {
            throw new Exception("Entire query string should be within a single text run in sense's equiv.");
        }
        // Highlight spans from the start of the first token to the end of the last one
        int hlStart = first.StartInRun;
        int hlEnd = last.StartInRun + last.LengthInRun;
        return new CedictTargetHighlight(ts.SenseIx, first.RunIx, hlStart, hlEnd - hlStart);
    }

    // Sequence not found
    return null;
}
/// <summary>
/// <para>Checks whether a candidate sense contains the query as a token sequence and, if so, records the match for its entry.</para>
/// </summary>
/// <param name="txtTokenized">The tokenized query text.</param>
/// <param name="sensePos">The data position of the tokenized sense to verify.</param>
/// <param name="entryIdToInfo">Container for kept entry matches.</param>
/// <param name="br">Binary data source to read up tokenized sense.</param>
private void doVerifyTarget(ReadOnlyCollection<EquivToken> txtTokenized, int sensePos, Dictionary<int, EntryMatchInfo> entryIdToInfo, BinReader br)
{
    // Seek to the sense and read it up
    br.Position = sensePos;
    TokenizedSense sense = new TokenizedSense(br);

    // Locating the query in the sense gives us the highlight as well; no highlight means no match
    CedictTargetHighlight highlight = doFindTargetQuery(txtTokenized, sense);
    if (highlight == null)
    {
        return;
    }

    // Score: query length in tokens relative to the sense's token count
    float senseScore = ((float)txtTokenized.Count) / ((float)sense.EquivTokens.Count);

    EntryMatchInfo info;
    if (entryIdToInfo.TryGetValue(sense.EntryId, out info))
    {
        // Entry already has a match: keep the best score, collect every highlight
        if (senseScore > info.BestSenseScore)
        {
            info.BestSenseScore = senseScore;
        }
        info.TargetHilites.Add(highlight);
    }
    else
    {
        // First match for this entry
        info = new EntryMatchInfo
        {
            EntryId = sense.EntryId,
            BestSenseScore = senseScore,
        };
        info.TargetHilites.Add(highlight);
        entryIdToInfo[sense.EntryId] = info;
    }
}
/// <summary>
/// <para>Looks for query text in tokenized sense, returns corresponding target highlight if found.</para>
/// <para>If not found (sense doesn't contain query as a sequence), or if the query has no tokens, returns null.</para>
/// </summary>
/// <param name="txtTokenized">The tokenized query text.</param>
/// <param name="ts">The tokenized sense to search in.</param>
/// <returns>Highlight for the first position where the query's token IDs occur consecutively in the sense; null if there is none.</returns>
/// <exception cref="Exception">A multi-token match starts and ends in different text runs.</exception>
private CedictTargetHighlight doFindTargetQuery(ReadOnlyCollection<EquivToken> txtTokenized, TokenizedSense ts)
{
    int queryLength = txtTokenized.Count;

    // An empty query cannot match anything. Without this guard the scan below would treat
    // position 0 as a zero-length match and then index the sense's tokens at -1.
    if (queryLength == 0)
    {
        return null;
    }

    // Last start position at which the whole query still fits into the sense
    int lastStart = ts.EquivTokens.Count - queryLength;
    for (int i = 0; i <= lastStart; ++i)
    {
        // Count how many query tokens match consecutively from position i
        int j = 0;
        while (j != queryLength && txtTokenized[j].TokenId == ts.EquivTokens[i + j].TokenId)
        {
            ++j;
        }
        if (j != queryLength)
        {
            continue;
        }

        // We found full query text: create highlight now
        var first = ts.EquivTokens[i];

        // Query is a single token: highlight is exactly that token
        if (queryLength == 1)
        {
            return new CedictTargetHighlight(ts.SenseIx, first.RunIx, first.StartInRun, first.LengthInRun);
        }

        // Query is multiple tokens
        var last = ts.EquivTokens[i + j - 1];
        // Sanity check: all tokens in tokenized sense must be from same text run
        // We don't even index across multiple runs
        // And definitely don't look up queries that have Hanzi in the middle
        if (first.RunIx != last.RunIx)
        {
            throw new Exception("Entire query string should be within a single text run in sense's equiv.");
        }
        // Highlight spans from the start of the first token to the end of the last one
        int hlStart = first.StartInRun;
        int hlEnd = last.StartInRun + last.LengthInRun;
        return new CedictTargetHighlight(ts.SenseIx, first.RunIx, hlStart, hlEnd - hlStart);
    }

    // Sequence not found
    return null;
}
/// <summary>
/// <para>Checks whether a candidate sense contains the query as a token sequence and, if so, records the match for its entry.</para>
/// </summary>
/// <param name="txtTokenized">The tokenized query text.</param>
/// <param name="sensePos">The data position of the tokenized sense to verify.</param>
/// <param name="entryIdToInfo">Container for kept entry matches.</param>
/// <param name="br">Binary data source to read up tokenized sense.</param>
private void doVerifyTarget(ReadOnlyCollection<EquivToken> txtTokenized, int sensePos, Dictionary<int, EntryMatchInfo> entryIdToInfo, BinReader br)
{
    // Seek to the sense and read it up
    br.Position = sensePos;
    TokenizedSense sense = new TokenizedSense(br);

    // Locating the query in the sense gives us the highlight as well; no highlight means no match
    CedictTargetHighlight highlight = doFindTargetQuery(txtTokenized, sense);
    if (highlight == null)
    {
        return;
    }

    // Score: query length in tokens relative to the sense's token count
    float senseScore = ((float)txtTokenized.Count) / ((float)sense.EquivTokens.Count);

    EntryMatchInfo info;
    if (entryIdToInfo.TryGetValue(sense.EntryId, out info))
    {
        // Entry already has a match: keep the best score, collect every highlight
        if (senseScore > info.BestSenseScore)
        {
            info.BestSenseScore = senseScore;
        }
        info.TargetHilites.Add(highlight);
    }
    else
    {
        // First match for this entry
        info = new EntryMatchInfo
        {
            EntryId = sense.EntryId,
            BestSenseScore = senseScore,
        };
        info.TargetHilites.Add(highlight);
        entryIdToInfo[sense.EntryId] = info;
    }
}
/// <summary>
/// <para>Keeps one tokenized sense in memory and adds it to the sense index under each distinct token ID it contains.</para>
/// <para>A sense whose tokens all have the ID <c>IdZho</c> or <c>IdNum</c> (Chinese and number tokens) is neither stored nor indexed.</para>
/// </summary>
/// <param name="tokens">The tokens of the sense.</param>
/// <param name="entryId">Entry ID stored with the tokenized sense.</param>
/// <param name="senseIx">Sense index stored with the tokenized sense.</param>
/// <exception cref="Exception">The sense has more distinct token IDs than fit in a byte.</exception>
private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
{
    // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
    bool relevant = false;
    foreach (EquivToken eqt in tokens)
    {
        if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
        {
            relevant = true;
            break;
        }
    }
    if (!relevant)
    {
        return;
    }

    // Get the set of different token IDs - we don't index dupes
    HashSet<int> tokenIdSet = new HashSet<int>();
    foreach (EquivToken eqt in tokens)
    {
        tokenIdSet.Add(eqt.TokenId);
    }

    // Validate BEFORE touching any shared state: the distinct-token count is stored as a byte.
    // Checking up front means a failure leaves both the sense list and the index untouched.
    if (tokenIdSet.Count > byte.MaxValue)
    {
        throw new Exception("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());
    }
    byte tokensInSense = (byte)tokenIdSet.Count;

    // Keep tokenized sense in memory; its ID is its position in the list
    int senseId = tsenses.Count;
    TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
    tsenses.Add(ts);

    // Now, add the sense to the instance list of each distinct token ID
    foreach (int tokenId in tokenIdSet)
    {
        // Single lookup; create the index item the first time a token ID is seen
        SenseIndexItem sii;
        if (!index.SenseIndex.TryGetValue(tokenId, out sii))
        {
            sii = new SenseIndexItem();
            index.SenseIndex[tokenId] = sii;
        }
        SenseInfo senseInfo = new SenseInfo
        {
            TokenizedSenseId = senseId,
            TokensInSense = tokensInSense,
        };
        sii.Instances.Add(senseInfo);
    }
}