Example #1
0
        /// <summary>
        /// Ctor: rebuilds the index by reading its serialized form from a binary reader.
        /// </summary>
        /// <param name="br">Reader the serialized data is consumed from, strictly in order.</param>
        public Index(BinReader br)
        {
            // The word holder deserializes itself first, from the same reader
            WordHolder = new CedictEngine.WordHolder(br);

            // Sense index: a key count, followed by (token ID, item) pairs
            SenseIndex = new Dictionary<int, SenseIndexItem>();
            int senseKeys = br.ReadInt();
            for (int n = 0; n != senseKeys; ++n)
            {
                int key = br.ReadInt();
                SenseIndex[key] = new SenseIndexItem(br);
            }

            IdeoIndex = new Dictionary<char, IdeoIndexItem>();
            PinyinIndex = new Dictionary<string, PinyinIndexItem>();

            // Ideograph index: a key count, followed by (character, item) pairs
            int ideoKeys = br.ReadInt();
            for (int n = 0; n != ideoKeys; ++n)
            {
                char key = br.ReadChar();
                IdeoIndex[key] = new IdeoIndexItem(br);
            }

            // Pinyin index: a key count, followed by (string key, item) pairs
            int pinyinKeys = br.ReadInt();
            for (int n = 0; n != pinyinKeys; ++n)
            {
                string key = br.ReadString();
                PinyinIndex[key] = new PinyinIndexItem(br);
            }
        }
Example #2
0
        /// <summary>
        /// Ctor: restores the index from binary data.
        /// </summary>
        /// <param name="br">Reader delivering the serialized data; it is read sequentially.</param>
        public Index(BinReader br)
        {
            // Word holder comes first in the stream
            WordHolder = new CedictEngine.WordHolder(br);

            // Table 1: token ID -> sense index item, preceded by its entry count
            Dictionary<int, SenseIndexItem> senses = new Dictionary<int, SenseIndexItem>();
            int senseTotal = br.ReadInt();
            for (int k = 0; k != senseTotal; ++k)
            {
                int id = br.ReadInt();
                SenseIndexItem item = new SenseIndexItem(br);
                senses[id] = item;
            }
            SenseIndex = senses;

            // Table 2: character -> ideograph index item, preceded by its entry count
            Dictionary<char, IdeoIndexItem> ideos = new Dictionary<char, IdeoIndexItem>();
            int ideoTotal = br.ReadInt();
            for (int k = 0; k != ideoTotal; ++k)
            {
                char ch = br.ReadChar();
                IdeoIndexItem item = new IdeoIndexItem(br);
                ideos[ch] = item;
            }
            IdeoIndex = ideos;

            // Table 3: string key -> pinyin index item, preceded by its entry count
            Dictionary<string, PinyinIndexItem> pinyins = new Dictionary<string, PinyinIndexItem>();
            int pinyinTotal = br.ReadInt();
            for (int k = 0; k != pinyinTotal; ++k)
            {
                string text = br.ReadString();
                PinyinIndexItem item = new PinyinIndexItem(br);
                pinyins[text] = item;
            }
            PinyinIndex = pinyins;
        }
Example #3
0
        /// <summary>
        /// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
        /// </summary>
        /// <param name="entry">Parsed entry whose headwords, pinyin syllables and senses are indexed.</param>
        /// <param name="id">Value recorded in the index lists to identify the entry.</param>
        /// <exception cref="Exception">
        /// A headword has more than 255 distinct characters, or a pinyin syllable's tone is outside -1..4.
        /// </exception>
        private void indexEntry(CedictEntry entry, int id)
        {
            // Collect the distinct characters of both headwords up front. Both size checks run
            // before anything is added to the index, so an over-long headword is rejected cleanly.
            HashSet<char> simpSet = new HashSet<char>();
            foreach (char c in entry.ChSimpl)
            {
                simpSet.Add(c);
            }
            if (simpSet.Count > byte.MaxValue)
            {
                throw new Exception("Simplified headword too long; max: 255.");
            }
            // The distinct-character count is stored as a byte in every pointer below
            byte simpCount = (byte)simpSet.Count;

            HashSet<char> tradSet = new HashSet<char>();
            foreach (char c in entry.ChTrad)
            {
                tradSet.Add(c);
            }
            if (tradSet.Count > byte.MaxValue)
            {
                throw new Exception("Traditional headword too long; max: 255.");
            }
            byte tradCount = (byte)tradSet.Count;

            // Index characters of simplified headword
            foreach (char c in simpSet)
            {
                // Single lookup; create the index item on first sight of this character
                IdeoIndexItem ii;
                if (!index.IdeoIndex.TryGetValue(c, out ii))
                {
                    ii = new IdeoIndexItem();
                    index.IdeoIndex[c] = ii;
                }
                ii.EntriesHeadwordSimp.Add(new IdeoEntryPtr {
                    EntryIdx = id, HwCharCount = simpCount
                });
            }

            // Index characters of traditional headword
            foreach (char c in tradSet)
            {
                IdeoIndexItem ii;
                if (!index.IdeoIndex.TryGetValue(c, out ii))
                {
                    ii = new IdeoIndexItem();
                    index.IdeoIndex[c] = ii;
                }
                ii.EntriesHeadwordTrad.Add(new IdeoEntryPtr {
                    EntryIdx = id, HwCharCount = tradCount
                });
            }

            // Index pinyin syllables
            foreach (PinyinSyllable pys in entry.Pinyin)
            {
                // Index contains lower-case syllables
                string textLo = pys.Text.ToLowerInvariant();
                PinyinIndexItem pi;
                if (!index.PinyinIndex.TryGetValue(textLo, out pi))
                {
                    pi = new PinyinIndexItem();
                    index.PinyinIndex[textLo] = pi;
                }

                // Figure out which list in index item - by tone (-1 selects the "NT" list)
                List<int> entryList;
                switch (pys.Tone)
                {
                    case -1: entryList = pi.EntriesNT; break;
                    case 0: entryList = pi.Entries0; break;
                    case 1: entryList = pi.Entries1; break;
                    case 2: entryList = pi.Entries2; break;
                    case 3: entryList = pi.Entries3; break;
                    case 4: entryList = pi.Entries4; break;
                    default: throw new Exception("Invalid tone: " + pys.Tone.ToString());
                }

                // Avoid indexing same entry twice if a syllable occurs multiple times:
                // within this call, an earlier occurrence left id as the list's last element
                if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
                {
                    entryList.Add(id);
                }
            }

            // Index equiv of each sense; senseIx counts every sense, including skipped ones
            int senseIx = -1;
            foreach (CedictSense sense in entry.Senses)
            {
                ++senseIx;
                // Empty equiv: nothing to index
                if (sense.Equiv.IsEmpty)
                {
                    continue;
                }
                ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
                indexSense(tokens, id, senseIx);
            }
        }
Example #4
0
 /// <summary>
 /// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
 /// </summary>
 /// <param name="entry">Parsed entry whose headwords, pinyin syllables and senses are indexed.</param>
 /// <param name="id">Value recorded in the index lists to identify the entry.</param>
 /// <exception cref="Exception">A pinyin syllable's tone is outside -1..4.</exception>
 private void indexEntry(CedictEntry entry, int id)
 {
     // Index characters of simplified headword
     foreach (char c in entry.ChSimpl)
     {
         // Single lookup; create the index item on first sight of this character
         IdeoIndexItem ii;
         if (!index.IdeoIndex.TryGetValue(c, out ii))
         {
             ii = new IdeoIndexItem();
             index.IdeoIndex[c] = ii;
         }
         // Avoid indexing same entry twice if a char occurs multiple times:
         // an earlier occurrence in this headword left id as the list's last element
         if (ii.EntriesHeadwordSimp.Count == 0 ||
             ii.EntriesHeadwordSimp[ii.EntriesHeadwordSimp.Count - 1] != id)
         {
             ii.EntriesHeadwordSimp.Add(id);
         }
     }

     // Index characters of traditional headword
     foreach (char c in entry.ChTrad)
     {
         IdeoIndexItem ii;
         if (!index.IdeoIndex.TryGetValue(c, out ii))
         {
             ii = new IdeoIndexItem();
             index.IdeoIndex[c] = ii;
         }
         // Avoid indexing same entry twice if a char occurs multiple times
         if (ii.EntriesHeadwordTrad.Count == 0 ||
             ii.EntriesHeadwordTrad[ii.EntriesHeadwordTrad.Count - 1] != id)
         {
             ii.EntriesHeadwordTrad.Add(id);
         }
     }

     // Index pinyin syllables
     foreach (PinyinSyllable pys in entry.Pinyin)
     {
         // Index contains lower-case syllables
         string textLo = pys.Text.ToLowerInvariant();
         PinyinIndexItem pi;
         if (!index.PinyinIndex.TryGetValue(textLo, out pi))
         {
             pi = new PinyinIndexItem();
             index.PinyinIndex[textLo] = pi;
         }

         // Figure out which list in index item - by tone (-1 selects the "NT" list)
         List<int> entryList;
         switch (pys.Tone)
         {
             case -1: entryList = pi.EntriesNT; break;
             case 0: entryList = pi.Entries0; break;
             case 1: entryList = pi.Entries1; break;
             case 2: entryList = pi.Entries2; break;
             case 3: entryList = pi.Entries3; break;
             case 4: entryList = pi.Entries4; break;
             default: throw new Exception("Invalid tone: " + pys.Tone.ToString());
         }

         // Avoid indexing same entry twice if a syllable occurs multiple times
         if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
         {
             entryList.Add(id);
         }
     }

     // Index equiv of each sense; senseIx counts every sense, including skipped ones
     int senseIx = -1;
     foreach (CedictSense sense in entry.Senses)
     {
         ++senseIx;
         // Empty equiv: nothing to index
         if (sense.Equiv.IsEmpty)
         {
             continue;
         }
         ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
         indexSense(tokens, id, senseIx);
     }
 }
Example #5
0
        /// <summary>
        /// Retrieves entries (sorted) whose headword contains pinyin from search expression.
        /// </summary>
        /// <param name="br">Reader passed through to <c>doLoadVerifyPinyin</c>.</param>
        /// <param name="sylls">Query syllables; repeats are allowed, and a tone of -1 means "unspecified".</param>
        /// <returns>
        /// Results produced by <c>doLoadVerifyPinyin</c>, sorted with <c>pyComp</c>. An empty list is
        /// returned immediately if any query syllable is absent from the pinyin index.
        /// </returns>
        /// <exception cref="Exception">An indexed query syllable has a tone outside -1..4.</exception>
        List <CedictResult> doPinyinLookupHead(BinReader br, List <PinyinSyllable> sylls)
        {
            // Get every syllable once - we ignore repeats
            // If a syllable occurs with unspecified tone once, or if it occurs with multiple tone marks
            // -> We only take it as one item with unspecified tone
            // Otherwise, take it as is, with tone mark
            Dictionary<string, int> syllDict = new Dictionary<string, int>();
            foreach (var syll in sylls)
            {
                int seenTone;
                if (!syllDict.TryGetValue(syll.Text, out seenTone))
                {
                    syllDict[syll.Text] = syll.Tone;
                }
                else if (seenTone != syll.Tone)
                {
                    syllDict[syll.Text] = -1;
                }
            }
            List<PinyinSyllable> querySylls = new List<PinyinSyllable>(syllDict.Count);
            foreach (var x in syllDict)
            {
                querySylls.Add(new PinyinSyllable(x.Key, x.Value));
            }

            // Map from keys (entry positions) to # of query syllables found in entry
            Dictionary<int, int> posToCount = new Dictionary<int, int>();

            // Look at each query syllable, increment counts for entries in syllable's list(s)
            foreach (PinyinSyllable syll in querySylls)
            {
                // If this syllable is not indexed, we sure won't have any hits
                PinyinIndexItem pii;
                if (!index.PinyinIndex.TryGetValue(syll.Text, out pii))
                {
                    return new List<CedictResult>();
                }

                // Positions to credit for this syllable; each position is credited at most once
                IEnumerable<int> positions;
                if (syll.Tone != -1)
                {
                    // Query specifies a tone mark: just that list
                    switch (syll.Tone)
                    {
                        case 0: positions = pii.Entries0; break;
                        case 1: positions = pii.Entries1; break;
                        case 2: positions = pii.Entries2; break;
                        case 3: positions = pii.Entries3; break;
                        case 4: positions = pii.Entries4; break;
                        default: throw new Exception("Invalid tone: " + syll.Tone.ToString());
                    }
                }
                else
                {
                    // Query does not specify a tone mark: take the union of all tone lists,
                    // so an entry listed under several tones is still counted only once
                    HashSet<int> posSet = new HashSet<int>();
                    posSet.UnionWith(pii.Entries0);
                    posSet.UnionWith(pii.Entries1);
                    posSet.UnionWith(pii.Entries2);
                    posSet.UnionWith(pii.Entries3);
                    posSet.UnionWith(pii.Entries4);
                    posSet.UnionWith(pii.EntriesNT);
                    positions = posSet;
                }

                foreach (int pos in positions)
                {
                    // TryGetValue leaves 0 in hits for an unseen position, so this covers both cases
                    int hits;
                    posToCount.TryGetValue(pos, out hits);
                    posToCount[pos] = hits + 1;
                }
            }

            // Get positions that contain all syllables from query
            HashSet<int> matchingPositions = new HashSet<int>();
            foreach (var x in posToCount)
            {
                if (x.Value == querySylls.Count)
                {
                    matchingPositions.Add(x.Key);
                }
            }

            // Now fetch and verify results; note the original (non-deduplicated) query is passed on
            List<ResWithEntry> resWE = doLoadVerifyPinyin(br, matchingPositions, sylls);

            // Sort pinyin results
            resWE.Sort((a, b) => pyComp(a, b));

            // Done. Presize from Count: Capacity may exceed the number of actual results.
            List<CedictResult> res = new List<CedictResult>(resWE.Count);
            foreach (ResWithEntry rwe in resWE)
            {
                res.Add(rwe.Res);
            }
            return res;
        }