/// <summary>
/// Inflates one compressed article record, renders it as a single ASCII text line of the
/// form <c>articleNumber mapping=count mapping=count ...</c> terminated by a newline,
/// writes that line through the stream returned by <c>GetOutStream</c>, and enqueues the
/// resulting bytes on <c>OutputStrings</c>.
/// </summary>
/// <param name="compressed">
/// Buffer whose <c>Inflate()</c> result is read as a 32-byte header (the article number
/// in its first four bytes) followed by n-gram records at a 48-byte stride.
/// </param>
private void SerializeBytes(byte[] compressed)
{
    // Offsets used when reading the inflated buffer: records start after the header
    // and are stepped through one fixed-size slot at a time.
    const int HeaderSize = 32;
    const int RecordSize = 48;

    using (var stream = new MemoryStream())
    {
        // The output stream is scoped on its own so that it is disposed (and therefore
        // finished writing into `stream`) before the bytes are collected below.
        using (var deflate = GetOutStream(stream))
        {
            void WriteAscii(string text) => deflate.Write(Encoding.ASCII.GetBytes(text));

            var items = compressed.Inflate();
            var articleNumber = BitConverter.ToInt32(items, 0);
            WriteAscii(articleNumber.ToString());

            for (int i = HeaderSize; i < items.Length; i += RecordSize)
            {
                var item = Ngram.FromByteArray(items, i);

                // Skip n-grams whose NgramStore.NgramCounts entry is below the cutoff.
                if (NgramStore.NgramCounts[item.Index] < Options.CutoffPercent * (double)ArticleCount)
                {
                    continue;
                }

                // The article number always precedes the first entry, so every entry
                // is introduced by a single space.
                WriteAscii(" ");

                var mapping = OutputIndexMappings[item.Index];
                WriteAscii(mapping + "=" + item.Count);
            }

            WriteAscii("\n");
        }

        // Progress message whenever the number of remaining articles is a non-zero multiple of 1000.
        if (NgramStore.Articles.Count != 0 && NgramStore.Articles.Count % 1000 == 0)
        {
            WriteLine($"Writing an item. {NgramStore.Articles.Count} remaining.");
        }

        // MemoryStream.ToArray remains usable even if disposing the output stream closed `stream`.
        OutputStrings.Enqueue(stream.ToArray());
    }
}
/// <summary>
/// Counts the n-grams of one article for every order in <c>Options.Orders</c>, registers
/// newly seen n-grams in the shared <c>NgramIndexes</c> / <c>NgramCounts</c> tables, and
/// packs the per-article counts into a deflated byte buffer.
/// </summary>
/// <param name="article">The article as an ordered sequence of word values.</param>
/// <param name="articleNumber">Number written into the first four bytes of the buffer header.</param>
/// <returns>
/// The result of <c>Deflate()</c> on a buffer made of a 32-byte header followed by one
/// 48-byte slot per distinct n-gram found in the article.
/// </returns>
byte[] GetNgrams(uint[] article, int articleNumber)
{
    // Buffer layout: fixed header, then one fixed-size slot per distinct n-gram.
    const int HeaderSize = 32;
    const int RecordSize = 48;

    var ngrams = new Dictionary<long, Ngram>();
    var length = article.Length;

    foreach (var size in Options.Orders)
    {
        // NOTE(review): the strict '<' means the window starting at (length - size),
        // i.e. the last possible n-gram, is never visited. Left as-is to keep output
        // unchanged - confirm whether this is intentional.
        for (int i = 0; i < length - size; i++)
        {
            var words = new uint[size];
            Array.Copy(article, i, words, 0, size);

            var ngramId = Ngram(words);

            // Already seen in this article: only the per-article count changes.
            if (ngrams.TryGetValue(ngramId, out var existing))
            {
                existing.Count++;
                continue;
            }

            // First occurrence in this article: look up the shared index for this
            // n-gram, allocating the next one if it has none yet.
            int ngramIndex;
            lock (ngramLock)
            {
                if (!NgramIndexes.TryGetValue(ngramId, out ngramIndex))
                {
                    NgramIndexes[ngramId] = ngramIndex = lastNgramIndex++;
                }
            }

            ngrams[ngramId] = new Ngram { Count = 1, Index = ngramIndex };

            // NgramCounts is bumped once per article per n-gram, not once per occurrence.
            lock (NgramCounts)
            {
                NgramCounts.TryGetValue(ngramIndex, out int ngramCount);
                NgramCounts[ngramIndex] = ++ngramCount;
            }
        }
    }

    var raw = new byte[HeaderSize + RecordSize * ngrams.Count];
    BitConverter.GetBytes(articleNumber).CopyTo(raw, 0);

    int byteOffset = HeaderSize;
    foreach (var item in ngrams)
    {
        item.Value.ToByteArray().CopyTo(raw, byteOffset);
        byteOffset += RecordSize;
    }

    // NOTE(review): this increment is not protected by either lock above; if the method
    // runs on several threads the progress counter may miss or repeat values - verify.
    if (++complete % 1000 == 0)
    {
        Console.WriteLine($"Finished n-gramming {complete} of {total}");
    }

    return raw.Deflate();
}