Beispiel #1
0
        /// <summary>
        /// Deserializing constructor: rebuilds the index from binary data.
        /// </summary>
        /// <remarks>
        /// Reads, in order: the word holder, then the sense, ideograph and pinyin indexes.
        /// Each of the three indexes is read as an int item count followed by that many
        /// key/item pairs.
        /// </remarks>
        /// <param name="br">Reader that supplies the serialized data.</param>
        public Index(BinReader br)
        {
            // The word holder deserializes itself from the same reader
            WordHolder = new CedictEngine.WordHolder(br);

            // Sense index: int key, then the item
            SenseIndex = new Dictionary<int, SenseIndexItem>();
            int senseItemCount = br.ReadInt();
            for (int n = 0; n != senseItemCount; ++n)
            {
                int tokenKey = br.ReadInt();
                var senseItem = new SenseIndexItem(br);
                SenseIndex[tokenKey] = senseItem;
            }

            // Both remaining dictionaries are created before any of their data is read
            IdeoIndex = new Dictionary<char, IdeoIndexItem>();
            PinyinIndex = new Dictionary<string, PinyinIndexItem>();

            // Ideograph index: char key, then the item
            int ideoItemCount = br.ReadInt();
            for (int n = 0; n != ideoItemCount; ++n)
            {
                char hanzi = br.ReadChar();
                var ideoItem = new IdeoIndexItem(br);
                IdeoIndex[hanzi] = ideoItem;
            }

            // Pinyin index: string key, then the item
            int pinyinItemCount = br.ReadInt();
            for (int n = 0; n != pinyinItemCount; ++n)
            {
                string syllable = br.ReadString();
                var pinyinItem = new PinyinIndexItem(br);
                PinyinIndex[syllable] = pinyinItem;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Indexes one parsed Cedict entry (hanzi, pinyin and target-language indexes).
        /// </summary>
        /// <param name="entry">Parsed entry whose headwords, pinyin syllables and senses are indexed.</param>
        /// <param name="id">Value recorded in the indexes to refer to this entry.</param>
        /// <exception cref="ArgumentException">
        /// A headword has more than 255 distinct characters, or a pinyin syllable's tone
        /// is not one of -1, 0, 1, 2, 3, 4.
        /// </exception>
        private void indexEntry(CedictEntry entry, int id)
        {
            // Distinct characters of the simplified headword
            HashSet<char> simpChars = new HashSet<char>();
            foreach (char c in entry.ChSimpl)
            {
                simpChars.Add(c);
            }
            // The distinct-character count is narrowed to a byte below, so it must fit
            if (simpChars.Count > byte.MaxValue)
            {
                throw new ArgumentException("Simplified headword too long; max: 255.");
            }
            byte simpCount = (byte)simpChars.Count;

            // Distinct characters of the traditional headword
            HashSet<char> tradChars = new HashSet<char>();
            foreach (char c in entry.ChTrad)
            {
                tradChars.Add(c);
            }
            if (tradChars.Count > byte.MaxValue)
            {
                throw new ArgumentException("Traditional headword too long; max: 255.");
            }
            byte tradCount = (byte)tradChars.Count;

            // Index characters of simplified headword
            foreach (char c in simpChars)
            {
                // Single lookup: fetch the existing item, or create and register a new one
                IdeoIndexItem ideoItem;
                if (!index.IdeoIndex.TryGetValue(c, out ideoItem))
                {
                    ideoItem = new IdeoIndexItem();
                    index.IdeoIndex[c] = ideoItem;
                }
                ideoItem.EntriesHeadwordSimp.Add(new IdeoEntryPtr
                {
                    EntryIdx = id,
                    HwCharCount = simpCount
                });
            }

            // Index characters of traditional headword
            foreach (char c in tradChars)
            {
                IdeoIndexItem ideoItem;
                if (!index.IdeoIndex.TryGetValue(c, out ideoItem))
                {
                    ideoItem = new IdeoIndexItem();
                    index.IdeoIndex[c] = ideoItem;
                }
                ideoItem.EntriesHeadwordTrad.Add(new IdeoEntryPtr
                {
                    EntryIdx = id,
                    HwCharCount = tradCount
                });
            }

            // Index pinyin syllables
            foreach (PinyinSyllable syll in entry.Pinyin)
            {
                // Index contains lower-case syllables
                string textLo = syll.Text.ToLowerInvariant();
                PinyinIndexItem pinyinItem;
                if (!index.PinyinIndex.TryGetValue(textLo, out pinyinItem))
                {
                    pinyinItem = new PinyinIndexItem();
                    index.PinyinIndex[textLo] = pinyinItem;
                }

                // Each tone value has its own entry list within the index item
                List<int> entryList;
                switch (syll.Tone)
                {
                    case -1:
                        entryList = pinyinItem.EntriesNT;
                        break;
                    case 0:
                        entryList = pinyinItem.Entries0;
                        break;
                    case 1:
                        entryList = pinyinItem.Entries1;
                        break;
                    case 2:
                        entryList = pinyinItem.Entries2;
                        break;
                    case 3:
                        entryList = pinyinItem.Entries3;
                        break;
                    case 4:
                        entryList = pinyinItem.Entries4;
                        break;
                    default:
                        throw new ArgumentException("Invalid tone: " + syll.Tone.ToString());
                }

                // Avoid indexing same entry twice if a syllable occurs multiple times:
                // only the list's last element is compared against the current ID
                if (entryList.Count == 0 || entryList[entryList.Count - 1] != id)
                {
                    entryList.Add(id);
                }
            }

            // Index equiv of each sense; senseIx is the sense's position within the entry
            int senseIx = 0;
            foreach (CedictSense sense in entry.Senses)
            {
                // Empty equiv: nothing to index (the sense still consumes a position)
                if (!sense.Equiv.IsEmpty)
                {
                    ReadOnlyCollection<EquivToken> tokens = tokenizer.Tokenize(sense.Equiv);
                    indexSense(tokens, id, senseIx);
                }
                ++senseIx;
            }
        }
Beispiel #3
0
        /// <summary>
        /// Retrieves entries (sorted) whose headword contains hanzi from search expression.
        /// </summary>
        /// <param name="br">Reader handed on to <c>doLoadVerifyHanzi</c>.</param>
        /// <param name="query">Search expression; a character occurring several times is counted once.</param>
        /// <param name="script">Handed on to <c>doLoadVerifyHanzi</c>.</param>
        /// <returns>
        /// Results produced by <c>doLoadVerifyHanzi</c>, sorted with <c>hrComp</c>. An empty list
        /// when some query character is not present in the ideograph index.
        /// </returns>
        List<CedictResult> doHanziLookupHead(BinReader br, string query, SearchScript script)
        {
            // Get every character once - repeats are ignored
            HashSet<char> queryChars = new HashSet<char>();
            foreach (char c in query)
            {
                queryChars.Add(c);
            }

            // Entry index -> number of distinct query chars found in that entry's headword,
            // tracked separately for the simplified and traditional headwords
            Dictionary<int, int> simpHits = new Dictionary<int, int>();
            Dictionary<int, int> tradHits = new Dictionary<int, int>();

            foreach (char c in queryChars)
            {
                // A character missing from the index means no entry can contain all of them
                IdeoIndexItem ideoItem;
                if (!index.IdeoIndex.TryGetValue(c, out ideoItem))
                {
                    return new List<CedictResult>();
                }

                foreach (var ptr in ideoItem.EntriesHeadwordSimp)
                {
                    // TryGetValue leaves 0 in the out variable when the key is absent
                    int soFar;
                    simpHits.TryGetValue(ptr.EntryIdx, out soFar);
                    simpHits[ptr.EntryIdx] = soFar + 1;
                }
                foreach (var ptr in ideoItem.EntriesHeadwordTrad)
                {
                    int soFar;
                    tradHits.TryGetValue(ptr.EntryIdx, out soFar);
                    tradHits[ptr.EntryIdx] = soFar + 1;
                }
            }

            // Keep entries whose simplified or traditional count equals the number of
            // distinct query characters
            HashSet<int> matchingPositions = new HashSet<int>();
            foreach (var hit in simpHits)
            {
                if (hit.Value == queryChars.Count)
                {
                    matchingPositions.Add(hit.Key);
                }
            }
            foreach (var hit in tradHits)
            {
                if (hit.Value == queryChars.Count)
                {
                    matchingPositions.Add(hit.Key);
                }
            }

            // Now fetch and verify results
            List<ResWithEntry> resWE = doLoadVerifyHanzi(br, matchingPositions, query, script);

            // Sort hanzi results
            resWE.Sort((a, b) => hrComp(a, b));

            // Presize by the actual element count rather than the source list's capacity
            List<CedictResult> res = new List<CedictResult>(resWE.Count);
            foreach (ResWithEntry withEntry in resWE)
            {
                res.Add(withEntry.Res);
            }
            return res;
        }