/// <summary>
/// Stores one tokenized sense in <c>tsenses</c> and registers it in the sense index
/// under every distinct token ID it contains.
/// </summary>
/// <param name="tokens">Equivalent tokens of the sense's text.</param>
/// <param name="entryId">ID of the dictionary entry the sense belongs to.</param>
/// <param name="senseIx">Index of the sense within its entry.</param>
/// <exception cref="InvalidOperationException">
/// If the sense's distinct token count does not fit in a byte (TokensInSense is a byte field).
/// </exception>
private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
{
    // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
    bool relevant = false;
    foreach (EquivToken eqt in tokens)
    {
        if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
        {
            relevant = true;
            break;
        }
    }
    if (!relevant) return;

    // Keep tokenized sense in memory; its list position becomes its ID.
    int senseId = tsenses.Count;
    TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
    tsenses.Add(ts);

    // Add to instance list of each token in list
    // First get set of different token IDs - we don't index dupes
    HashSet<int> tokenIdSet = new HashSet<int>();
    foreach (EquivToken eqt in tokens) tokenIdSet.Add(eqt.TokenId);

    // TokensInSense is stored as a byte. The count is loop-invariant, so verify it
    // once up front instead of re-checking on every iteration below.
    if (tokenIdSet.Count > byte.MaxValue)
        throw new InvalidOperationException("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());

    // Now, index each distinct ID
    foreach (int tokenId in tokenIdSet)
    {
        // Fetch-or-create with a single dictionary lookup (no ContainsKey + indexer).
        SenseIndexItem sii;
        if (!index.SenseIndex.TryGetValue(tokenId, out sii))
        {
            sii = new SenseIndexItem();
            index.SenseIndex[tokenId] = sii;
        }
        SenseInfo senseInfo = new SenseInfo
        {
            TokenizedSenseId = senseId,
            TokensInSense = (byte)tokenIdSet.Count,
        };
        sii.Instances.Add(senseInfo);
    }
}
/// <summary>
/// Writes parsed and indexed dictionary to compiled binary file.
/// </summary>
/// <param name="date">Timestamp to stamp into the file header (serialized as Ticks).</param>
/// <param name="dictFileName">Path of the compiled binary dictionary to create.</param>
/// <param name="statsFolder">Folder where parse statistics are written first.</param>
/// <exception cref="InvalidOperationException">If called more than once on this instance.</exception>
public void WriteResults(DateTime date, string dictFileName, string statsFolder)
{
    // Cannot do this twice: we'll have replaced entry IDs with file positions in index
    if (resultsWritten) { throw new InvalidOperationException("WriteResults already called."); }
    resultsWritten = true;
    // First, statistics
    stats.WriteStats(statsFolder);
    // Start index of Hanzi repository in file
    int hrepoIdxPos;
    // ID to file position
    Dictionary<int, int> entryIdToPos = new Dictionary<int, int>();
    Dictionary<int, int> senseIdToPos = new Dictionary<int, int>();
    using (BinWriter bw = new BinWriter(dictFileName))
    {
        // File layout: header (date, count, two placeholder positions), then entries,
        // then tokenized senses, then the index, then the hanzi repository.
        // Write date and entry count
        bw.WriteLong(date.Ticks);
        bw.WriteInt(entries.Count);
        int returnPos = bw.Position;
        // Placeholder: will return here to save start position of index at end
        bw.WriteInt(-1);
        // Placeholder for hanzi repo position: will return here at very end
        hrepoIdxPos = bw.Position;
        bw.WriteInt(-1);
        // Serialize all entries; fill entry ID -> file pos map
        for (int i = 0; i != entries.Count; ++i)
        {
            entryIdToPos[i] = bw.Position;
            entries[i].Serialize(bw);
        }
        // Replace entry IDs with file positions in all tokenized senses
        // (must happen before senses are serialized, so the positions go into the file)
        for (int i = 0; i != tsenses.Count; ++i)
        {
            tsenses[i].EntryId = entryIdToPos[tsenses[i].EntryId];
        }
        // Serialize all tokenized senses; fill sense ID -> file pos map
        for (int i = 0; i != tsenses.Count; ++i)
        {
            senseIdToPos[i] = bw.Position;
            tsenses[i].Serialize(bw);
        }
        // Fill in index start position: seek back to the header placeholder, patch it,
        // then seek forward again to continue writing at the index position.
        int idxPos = bw.Position;
        bw.Position = returnPos;
        bw.WriteInt(idxPos);
        bw.Position = idxPos;
        // Replace IDs with file positions across index
        foreach (var x in index.IdeoIndex)
        {
            replaceIdsWithPositions(x.Value.EntriesHeadwordSimp, entryIdToPos);
            replaceIdsWithPositions(x.Value.EntriesHeadwordTrad, entryIdToPos);
            replaceIdsWithPositions(x.Value.EntriesSense, entryIdToPos);
        }
        foreach (var x in index.PinyinIndex)
        {
            // One entry list per tone variant (NT = no tone marked, then tones 0-4)
            replaceIdsWithPositions(x.Value.EntriesNT, entryIdToPos);
            replaceIdsWithPositions(x.Value.Entries0, entryIdToPos);
            replaceIdsWithPositions(x.Value.Entries1, entryIdToPos);
            replaceIdsWithPositions(x.Value.Entries2, entryIdToPos);
            replaceIdsWithPositions(x.Value.Entries3, entryIdToPos);
            replaceIdsWithPositions(x.Value.Entries4, entryIdToPos);
        }
        foreach (var x in index.SenseIndex)
        {
            List<SenseInfo> instances = x.Value.Instances;
            for (int i = 0; i != instances.Count; ++i)
            {
                // NOTE(review): the read-modify-write-back pattern suggests SenseInfo
                // is a value type (struct) — confirm against its declaration.
                SenseInfo senseInfo = instances[i];
                senseInfo.TokenizedSenseId = senseIdToPos[senseInfo.TokenizedSenseId];
                instances[i] = senseInfo;
            }
        }
        // Serialize index
        index.Serialize(bw);
        // Copy serialized hanzi repository from temp file; presumably also patches the
        // hrepoIdxPos placeholder written above — verify in writeHanziRepo.
        writeHanziRepo(bw, hrepoIdxPos);
    }
}
/// <summary>
/// Stores one tokenized sense in <c>tsenses</c> and registers it in the sense index
/// under every distinct token ID it contains.
/// </summary>
/// <param name="tokens">Equivalent tokens of the sense's text.</param>
/// <param name="entryId">ID of the dictionary entry the sense belongs to.</param>
/// <param name="senseIx">Index of the sense within its entry.</param>
/// <exception cref="InvalidOperationException">
/// If the sense's distinct token count does not fit in a byte (TokensInSense is a byte field).
/// </exception>
private void indexSense(ReadOnlyCollection<EquivToken> tokens, int entryId, int senseIx)
{
    // If there are no non-Chinese, non-number tokens: nothing to save, nothing to index
    bool relevant = false;
    foreach (EquivToken eqt in tokens)
    {
        if (eqt.TokenId != index.WordHolder.IdZho && eqt.TokenId != index.WordHolder.IdNum)
        {
            relevant = true;
            break;
        }
    }
    if (!relevant) return;

    // Keep tokenized sense in memory; its list position becomes its ID.
    int senseId = tsenses.Count;
    TokenizedSense ts = new TokenizedSense(entryId, senseIx, tokens);
    tsenses.Add(ts);

    // Add to instance list of each token in list
    // First get set of different token IDs - we don't index dupes
    HashSet<int> tokenIdSet = new HashSet<int>();
    foreach (EquivToken eqt in tokens) tokenIdSet.Add(eqt.TokenId);

    // TokensInSense is stored as a byte. The count is loop-invariant, so verify it
    // once up front instead of re-checking on every iteration below.
    if (tokenIdSet.Count > byte.MaxValue)
        throw new InvalidOperationException("Sense's token count out of byte range: " + tokenIdSet.Count.ToString());

    // Now, index each distinct ID
    foreach (int tokenId in tokenIdSet)
    {
        // Fetch-or-create with a single dictionary lookup (no ContainsKey + indexer).
        SenseIndexItem sii;
        if (!index.SenseIndex.TryGetValue(tokenId, out sii))
        {
            sii = new SenseIndexItem();
            index.SenseIndex[tokenId] = sii;
        }
        SenseInfo senseInfo = new SenseInfo
        {
            TokenizedSenseId = senseId,
            TokensInSense = (byte)tokenIdSet.Count,
        };
        sii.Instances.Add(senseInfo);
    }
}