Пример #1
0
        /// <summary>
        /// Keeps one sense's tokens in memory and registers the sense in the sense index
        /// under every distinct token ID it contains.
        /// </summary>
        /// <param name="tokens">Tokens of the sense.</param>
        /// <param name="entryId">Entry ID handed to the <c>TokenizedSense</c> constructor.</param>
        /// <param name="senseIx">Sense index handed to the <c>TokenizedSense</c> constructor.</param>
        /// <exception cref="Exception">
        /// The sense has more than <see cref="byte.MaxValue"/> distinct token IDs.
        /// </exception>
        private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
        {
            // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
            bool relevant = false;
            foreach (EquivToken eqt in tokens)
            {
                if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
                {
                    relevant = true;
                    break;
                }
            }
            if (!relevant)
            {
                return;
            }

            // Collect the distinct token IDs first - dupes are not indexed
            HashSet<int> tokenIdSet = new HashSet<int>();
            foreach (EquivToken eqt in tokens)
            {
                tokenIdSet.Add(eqt.TokenId);
            }

            // Validate once, and before tsenses or the index are touched, so that a failure
            // does not leave an orphan tokenized sense or an empty index item behind.
            // The distinct-token count is stored as a single byte in SenseInfo.
            if (tokenIdSet.Count > byte.MaxValue)
            {
                throw new Exception("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());
            }
            byte tokensInSense = (byte)tokenIdSet.Count;

            // Keep tokenized sense in memory; its list position serves as the sense ID
            int senseId = tsenses.Count;
            tsenses.Add(new TokenizedSense(entryId, senseIx, tokens));

            // Add the sense to the instance list of each distinct token ID
            foreach (int tokenId in tokenIdSet)
            {
                SenseIndexItem sii;
                // Single lookup; create and register the index item on first sight of the token
                if (!index.SenseIndex.TryGetValue(tokenId, out sii))
                {
                    sii = new SenseIndexItem();
                    index.SenseIndex[tokenId] = sii;
                }
                SenseInfo senseInfo = new SenseInfo
                {
                    TokenizedSenseId = senseId,
                    TokensInSense = tokensInSense,
                };
                sii.Instances.Add(senseInfo);
            }
        }
Пример #2
0
        /// <summary>
        /// Saves the parsed and indexed dictionary into one compiled binary file.
        /// </summary>
        /// <param name="date">Date whose tick count is stored at the very start of the file.</param>
        /// <param name="dictFileName">Name of the binary file to write.</param>
        /// <param name="statsFolder">Folder handed to the statistics writer.</param>
        /// <exception cref="InvalidOperationException">WriteResults has already been called.</exception>
        public void WriteResults(DateTime date, string dictFileName, string statsFolder)
        {
            // One-shot operation: afterwards the index holds file positions instead of entry IDs
            if (resultsWritten)
            {
                throw new InvalidOperationException("WriteResults already called.");
            }
            resultsWritten = true;

            // Statistics are written before the dictionary file itself
            stats.WriteStats(statsFolder);

            // Lookup tables: in-memory ID -> position in the output file
            var entryPositions = new Dictionary<int, int>();
            var sensePositions = new Dictionary<int, int>();

            using (BinWriter writer = new BinWriter(dictFileName))
            {
                // Header: date ticks, then the number of entries
                writer.WriteLong(date.Ticks);
                writer.WriteInt(entries.Count);

                // Reserved slot, patched below with the start position of the index
                int indexPosSlot = writer.Position;
                writer.WriteInt(-1);

                // Reserved slot for the Hanzi repository position; handed to writeHanziRepo at the end
                int hanziRepoPosSlot = writer.Position;
                writer.WriteInt(-1);

                // Entries: remember where each one starts, then serialize it
                for (int entryId = 0; entryId < entries.Count; ++entryId)
                {
                    entryPositions.Add(entryId, writer.Position);
                    entries[entryId].Serialize(writer);
                }

                // Tokenized senses must point at entry file positions before they are written
                for (int senseId = 0; senseId < tsenses.Count; ++senseId)
                {
                    tsenses[senseId].EntryId = entryPositions[tsenses[senseId].EntryId];
                }

                // Tokenized senses: remember where each one starts, then serialize it
                for (int senseId = 0; senseId < tsenses.Count; ++senseId)
                {
                    sensePositions.Add(senseId, writer.Position);
                    tsenses[senseId].Serialize(writer);
                }

                // Jump back to patch the index start position, then return to the end
                int indexStart = writer.Position;
                writer.Position = indexPosSlot;
                writer.WriteInt(indexStart);
                writer.Position = indexStart;

                // Ideograph index: entry IDs -> file positions
                foreach (var ideoPair in index.IdeoIndex)
                {
                    var ideoItem = ideoPair.Value;
                    replaceIdsWithPositions(ideoItem.EntriesHeadwordSimp, entryPositions);
                    replaceIdsWithPositions(ideoItem.EntriesHeadwordTrad, entryPositions);
                    replaceIdsWithPositions(ideoItem.EntriesSense, entryPositions);
                }

                // Pinyin index: entry IDs -> file positions, in each of its entry lists
                foreach (var pinyinPair in index.PinyinIndex)
                {
                    var pinyinItem = pinyinPair.Value;
                    replaceIdsWithPositions(pinyinItem.EntriesNT, entryPositions);
                    replaceIdsWithPositions(pinyinItem.Entries0, entryPositions);
                    replaceIdsWithPositions(pinyinItem.Entries1, entryPositions);
                    replaceIdsWithPositions(pinyinItem.Entries2, entryPositions);
                    replaceIdsWithPositions(pinyinItem.Entries3, entryPositions);
                    replaceIdsWithPositions(pinyinItem.Entries4, entryPositions);
                }

                // Sense index: tokenized sense IDs -> file positions
                foreach (var sensePair in index.SenseIndex)
                {
                    List<SenseInfo> infos = sensePair.Value.Instances;
                    for (int k = 0; k < infos.Count; ++k)
                    {
                        // Read, modify, and store back the list element
                        SenseInfo info = infos[k];
                        info.TokenizedSenseId = sensePositions[info.TokenizedSenseId];
                        infos[k] = info;
                    }
                }

                // The index now carries file positions only: write it out
                index.Serialize(writer);

                // Finally the Hanzi repository; writeHanziRepo receives the slot reserved above
                writeHanziRepo(writer, hanziRepoPosSlot);
            }
        }
Пример #3
0
        /// <summary>
        /// Stores the tokenized form of one sense and adds it to the sense index once
        /// for each different token ID that occurs in it.
        /// </summary>
        /// <param name="tokens">Tokens of the sense.</param>
        /// <param name="entryId">Entry ID passed on to the <c>TokenizedSense</c> constructor.</param>
        /// <param name="senseIx">Sense index passed on to the <c>TokenizedSense</c> constructor.</param>
        /// <exception cref="Exception">
        /// The number of distinct token IDs in the sense exceeds <see cref="byte.MaxValue"/>.
        /// </exception>
        private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
        {
            // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
            bool relevant = false;
            foreach (EquivToken eqt in tokens)
            {
                if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
                {
                    relevant = true;
                    break;
                }
            }
            if (!relevant)
            {
                return;
            }

            // Get set of different token IDs - we don't index dupes
            HashSet<int> tokenIdSet = new HashSet<int>();
            foreach (EquivToken eqt in tokens)
            {
                tokenIdSet.Add(eqt.TokenId);
            }

            // The count is stored in a byte, so reject oversized senses up front:
            // checking here (once, before any mutation) keeps tsenses and the index
            // unchanged when the exception is thrown.
            if (tokenIdSet.Count > byte.MaxValue)
            {
                throw new Exception("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());
            }
            byte tokensInSense = (byte)tokenIdSet.Count;

            // Keep tokenized sense in memory; the sense ID is its position in tsenses
            int senseId = tsenses.Count;
            TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
            tsenses.Add(ts);

            // Now, index each distinct ID
            foreach (int tokenId in tokenIdSet)
            {
                SenseIndexItem sii;
                // One dictionary lookup instead of ContainsKey followed by the indexer
                if (!index.SenseIndex.TryGetValue(tokenId, out sii))
                {
                    sii = new SenseIndexItem();
                    index.SenseIndex[tokenId] = sii;
                }
                SenseInfo senseInfo = new SenseInfo
                {
                    TokenizedSenseId = senseId,
                    TokensInSense = tokensInSense,
                };
                sii.Instances.Add(senseInfo);
            }
        }