Пример #1
0
        /// <summary>
        /// <para>Keeps one tokenized sense in memory and registers it in the sense index under each of its distinct token IDs.</para>
        /// <para>A sense whose tokens are all Chinese or number placeholders is neither stored nor indexed.</para>
        /// </summary>
        /// <param name="tokens">The tokens of the sense to index.</param>
        /// <param name="entryId">Entry ID; handed to the <c>TokenizedSense</c> constructor.</param>
        /// <param name="senseIx">Sense index; handed to the <c>TokenizedSense</c> constructor.</param>
        /// <exception cref="Exception">The sense has more than <see cref="byte.MaxValue"/> distinct token IDs.</exception>
        private void indexSense(ReadOnlyCollection <EquivToken> tokens, int entryId, int senseIx)
        {
            // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
            bool relevant = false;

            foreach (EquivToken eqt in tokens)
            {
                if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
                {
                    relevant = true;
                    break;
                }
            }
            if (!relevant)
            {
                return;
            }

            // Get set of different token IDs - we don't index dupes
            HashSet <int> tokenIdSet = new HashSet <int>();

            foreach (EquivToken eqt in tokens)
            {
                tokenIdSet.Add(eqt.TokenId);
            }
            // The distinct-token count is stored as a byte. Verify the range once, BEFORE touching
            // tsenses or the index, so a failure cannot leave a half-registered sense behind.
            if (tokenIdSet.Count > byte.MaxValue)
            {
                throw new Exception("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());
            }
            byte tokensInSense = (byte)tokenIdSet.Count;

            // Keep tokenized sense in memory; its position in the list is used as its ID
            int senseId = tsenses.Count;

            tsenses.Add(new TokenizedSense(entryId, senseIx, tokens));

            // Now, add to the instance list of each distinct token ID
            foreach (int tokenId in tokenIdSet)
            {
                SenseIndexItem sii;
                // Single lookup; create the index item the first time this token ID is seen
                if (!index.SenseIndex.TryGetValue(tokenId, out sii))
                {
                    sii = new SenseIndexItem();
                    index.SenseIndex[tokenId] = sii;
                }
                sii.Instances.Add(new SenseInfo
                {
                    TokenizedSenseId = senseId,
                    TokensInSense    = tokensInSense,
                });
            }
        }
Пример #2
0
 /// <summary>
 /// <para>Looks for query text in tokenized sense, returns corresponding target highlight if found.</para>
 /// <para>If not found (sense doesn't contain query as a contiguous token sequence), returns null.</para>
 /// <para>An empty query is never found.</para>
 /// </summary>
 /// <param name="txtTokenized">The tokenized query text.</param>
 /// <param name="ts">The tokenized sense to search in.</param>
 /// <returns>Highlight for the first position where the query's token IDs match, or null.</returns>
 /// <exception cref="Exception">First and last matched tokens are in different text runs.</exception>
 private CedictTargetHighlight doFindTargetQuery(ReadOnlyCollection <EquivToken> txtTokenized,
                                                 TokenizedSense ts)
 {
     int queryLen = txtTokenized.Count;

     // An empty query would trivially "match" at position 0 and then make the code below
     // read token index -1; report it as not found instead
     if (queryLen == 0)
     {
         return(null);
     }
     // Try every start position where the whole query still fits into the sense
     for (int i = 0; i <= ts.EquivTokens.Count - queryLen; ++i)
     {
         int j = 0;
         for (; j != queryLen; ++j)
         {
             if (txtTokenized[j].TokenId != ts.EquivTokens[i + j].TokenId)
             {
                 break;
             }
         }
         // Mismatch somewhere inside the query: move on to next start position
         if (j != queryLen)
         {
             continue;
         }
         // We found full query text: create highlight now
         var first = ts.EquivTokens[i];
         var last  = ts.EquivTokens[i + queryLen - 1];

         // Sanity check: all tokens in tokenized sense must be from same text run
         // We don't even index across multiple runs
         // And definitely don't look up queries that have Hanzi in the middle
         // (For a single-token query first and last are the same token, so this always passes)
         if (first.RunIx != last.RunIx)
         {
             throw new Exception("Entire query string should be within a single text run in sense's equiv.");
         }
         // Highlight goes from start of first token to end of last token; for a single-token
         // query that is exactly the token's own start and length
         int hlStart = first.StartInRun;
         int hlEnd   = last.StartInRun + last.LengthInRun;
         return(new CedictTargetHighlight(ts.SenseIx, first.RunIx, hlStart, hlEnd - hlStart));
     }
     // Sequence not found
     return(null);
 }
Пример #3
0
        /// <summary>
        /// <para>Verifies if a sense that contains all query tokens is really a match.</para>
        /// <para>If the query is found as a sequence in the sense, records the highlight and the best score
        /// in the match info of the sense's entry; otherwise leaves the container untouched.</para>
        /// </summary>
        /// <param name="txtTokenized">The tokenized query text.</param>
        /// <param name="sensePos">The data position of the tokenized sense to verify.</param>
        /// <param name="entryIdToInfo">Container for kept entry matches.</param>
        /// <param name="br">Binary data source to read up tokenized sense; its position is set to <paramref name="sensePos"/>.</param>
        private void doVerifyTarget(ReadOnlyCollection <EquivToken> txtTokenized,
                                    int sensePos,
                                    Dictionary <int, EntryMatchInfo> entryIdToInfo,
                                    BinReader br)
        {
            // Load tokenized sense
            br.Position = sensePos;
            TokenizedSense ts = new TokenizedSense(br);
            // Find query tokens in tokenized sense
            // This will be our highlight too!
            CedictTargetHighlight hilite = doFindTargetQuery(txtTokenized, ts);

            // No highlight: no match
            if (hilite == null)
            {
                return;
            }
            // Score is length of query (in tokens) divided by count of tokens in sense
            float score = ((float)txtTokenized.Count) / ((float)ts.EquivTokens.Count);

            // If we found query string, it's a match; we can go on and record best score and highlight
            // One dictionary lookup tells us whether the entry already has match info
            EntryMatchInfo emi;
            if (entryIdToInfo.TryGetValue(ts.EntryId, out emi))
            {
                // Entry seen before: keep the best score over all of its matching senses
                if (score > emi.BestSenseScore)
                {
                    emi.BestSenseScore = score;
                }
            }
            else
            {
                // First matching sense of this entry
                emi = new EntryMatchInfo
                {
                    EntryId        = ts.EntryId,
                    BestSenseScore = score,
                };
                entryIdToInfo[ts.EntryId] = emi;
            }
            // Either way, this sense contributes one highlight
            emi.TargetHilites.Add(hilite);
        }
Пример #4
0
 /// <summary>
 /// <para>Looks for query text in tokenized sense, returns corresponding target highlight if found.</para>
 /// <para>If not found (sense doesn't contain query as a sequence), or if the query has no tokens, returns null.</para>
 /// </summary>
 /// <param name="txtTokenized">The tokenized query text.</param>
 /// <param name="ts">The tokenized sense to search in.</param>
 /// <returns>Highlight for the first position where the query's token IDs match, or null.</returns>
 /// <exception cref="Exception">First and last matched tokens are in different text runs.</exception>
 private CedictTargetHighlight doFindTargetQuery(ReadOnlyCollection<EquivToken> txtTokenized,
     TokenizedSense ts)
 {
     // A query without tokens cannot be highlighted. Without this guard the scan below would
     // report a match at position 0 and then access token index -1.
     if (txtTokenized.Count == 0)
     {
         return null;
     }
     // Last start position at which the whole query still fits into the sense
     int lastStart = ts.EquivTokens.Count - txtTokenized.Count;
     for (int i = 0; i <= lastStart; ++i)
     {
         // Count how many leading query tokens match the sense from position i on
         int j = 0;
         while (j != txtTokenized.Count && txtTokenized[j].TokenId == ts.EquivTokens[i + j].TokenId)
         {
             ++j;
         }
         // If we found full query text: create highlight now
         if (j == txtTokenized.Count)
         {
             // Query is a single token
             if (txtTokenized.Count == 1)
             {
                 return new CedictTargetHighlight(ts.SenseIx, ts.EquivTokens[i].RunIx,
                     ts.EquivTokens[i].StartInRun, ts.EquivTokens[i].LengthInRun);
             }
             // Query is multiple tokens
             else
             {
                 int lastIx = i + j - 1;
                 // Sanity check: all tokens in tokenized sense must be from same text run
                 // We don't even index across multiple runs
                 // And definitely don't look up queries that have Hanzi in the middle
                 if (ts.EquivTokens[i].RunIx != ts.EquivTokens[lastIx].RunIx)
                 {
                     throw new Exception("Entire query string should be within a single text run in sense's equiv.");
                 }
                 // Highlight goes from start of first matched token to end of last matched token
                 int hlStart = ts.EquivTokens[i].StartInRun;
                 int hlEnd = ts.EquivTokens[lastIx].StartInRun + ts.EquivTokens[lastIx].LengthInRun;
                 return new CedictTargetHighlight(ts.SenseIx, ts.EquivTokens[i].RunIx,
                     hlStart, hlEnd - hlStart);
             }
         }
     }
     // Sequence not found
     return null;
 }
Пример #5
0
 /// <summary>
 /// <para>Verifies if a sense that contains all query tokens is really a match.</para>
 /// <para>A verified match adds its highlight to the match info of the sense's entry and raises that
 /// entry's best score if this sense scores higher; a non-match changes nothing.</para>
 /// </summary>
 /// <param name="txtTokenized">The tokenized query text.</param>
 /// <param name="sensePos">The data position of the tokenized sense to verify.</param>
 /// <param name="entryIdToInfo">Container for kept entry matches.</param>
 /// <param name="br">Binary data source to read up tokenized sense; its position is set to <paramref name="sensePos"/>.</param>
 private void doVerifyTarget(ReadOnlyCollection<EquivToken> txtTokenized,
     int sensePos,
     Dictionary<int, EntryMatchInfo> entryIdToInfo,
     BinReader br)
 {
     // Load tokenized sense
     br.Position = sensePos;
     TokenizedSense ts = new TokenizedSense(br);
     // Find query tokens in tokenized sense
     // This will be our highlight too!
     CedictTargetHighlight hilite = doFindTargetQuery(txtTokenized, ts);
     // No highlight: no match
     if (hilite == null) return;
     // Score is length of query (in tokens) divided by count of tokens in sense
     float score = ((float)txtTokenized.Count) / ((float)ts.EquivTokens.Count);
     // If we found query string, it's a match; we can go on and record best score and highlight
     EntryMatchInfo emi;
     if (!entryIdToInfo.TryGetValue(ts.EntryId, out emi))
     {
         // No match info for this entry yet: start one with this sense's score
         emi = new EntryMatchInfo
         {
             EntryId = ts.EntryId,
             BestSenseScore = score,
         };
         entryIdToInfo[ts.EntryId] = emi;
     }
     else if (score > emi.BestSenseScore)
     {
         // Entry already matched through another sense: only ever raise its best score
         emi.BestSenseScore = score;
     }
     // New or existing, the entry's match info gets this sense's highlight
     emi.TargetHilites.Add(hilite);
 }
Пример #6
0
        /// <summary>
        /// <para>Stores a tokenized sense in memory and adds it to the sense index once per distinct token ID.</para>
        /// <para>Does nothing if every token is a Chinese or number placeholder.</para>
        /// </summary>
        /// <param name="tokens">The tokens of the sense to index.</param>
        /// <param name="entryId">Entry ID; handed to the <c>TokenizedSense</c> constructor.</param>
        /// <param name="senseIx">Sense index; handed to the <c>TokenizedSense</c> constructor.</param>
        /// <exception cref="Exception">The sense has more than <see cref="byte.MaxValue"/> distinct token IDs.</exception>
        private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
        {
            // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
            bool relevant = false;
            foreach (EquivToken eqt in tokens)
            {
                if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
                { relevant = true; break; }
            }
            if (!relevant) return;

            // Get set of different token IDs - we don't index dupes
            HashSet<int> tokenIdSet = new HashSet<int>();
            foreach (EquivToken eqt in tokens) tokenIdSet.Add(eqt.TokenId);

            // Count of distinct IDs must fit in a byte. This is checked up front, before anything is
            // added to tsenses or the index, so that throwing leaves both exactly as they were.
            int distinctCount = tokenIdSet.Count;
            if (distinctCount > byte.MaxValue)
                throw new Exception("Sense's token count out of byte range: " + distinctCount.ToString());

            // Keep tokenized sense in memory; the list position it lands on is its ID
            int senseId = tsenses.Count;
            TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
            tsenses.Add(ts);

            // Now, index each distinct ID
            foreach (int tokenId in tokenIdSet)
            {
                // Fetch the token's index item with a single lookup, creating it if missing
                SenseIndexItem sii;
                if (!index.SenseIndex.TryGetValue(tokenId, out sii))
                {
                    sii = new SenseIndexItem();
                    index.SenseIndex[tokenId] = sii;
                }
                SenseInfo senseInfo = new SenseInfo
                {
                    TokenizedSenseId = senseId,
                    TokensInSense = (byte)distinctCount,
                };
                sii.Instances.Add(senseInfo);
            }
        }