/// <summary>
        /// Turns one article's packed n-gram buffer into a single text line of the form
        /// <c>articleNumber mapping=count mapping=count ...\n</c>, writes that line through
        /// the stream returned by <c>GetOutStream</c>, and enqueues the resulting bytes on
        /// <c>OutputStrings</c>.
        /// </summary>
        /// <param name="compressed">
        /// Buffer which, after <c>Inflate()</c>, holds the article number in its first four
        /// bytes and one n-gram record every 48 bytes starting at offset 32.
        /// </param>
        private void SerializeBytes(byte[] compressed)
        {
            // Layout of the inflated buffer as read below.
            const int HeaderSize = 32;
            const int RecordSize = 48;

            using (var stream = new MemoryStream())
            {
                // The writer is scoped to its own block so it is closed (and has flushed
                // everything into `stream`) exactly once, before the bytes are collected.
                // NOTE(review): presumably GetOutStream wraps `stream` in a compressing stream - confirm.
                using (var deflate = GetOutStream(stream))
                {
                    var separator     = Encoding.ASCII.GetBytes(" ");
                    var items         = compressed.Inflate();
                    var articleNumber = BitConverter.ToInt32(items, 0);

                    // The line is machine-readable ASCII, so numbers are formatted culture-invariantly.
                    deflate.Write(Encoding.ASCII.GetBytes(
                        articleNumber.ToString(System.Globalization.CultureInfo.InvariantCulture)));

                    for (int offset = HeaderSize; offset < items.Length; offset += RecordSize)
                    {
                        var item = Ngram.FromByteArray(items, offset);

                        // Skip n-grams whose global count is below the configured cutoff.
                        if (NgramStore.NgramCounts[item.Index] < Options.CutoffPercent * (double)ArticleCount)
                        {
                            continue;
                        }

                        // Every token is preceded by one space; for the first token this is
                        // what separates it from the article number.
                        deflate.Write(separator);

                        var mapping = OutputIndexMappings[item.Index];
                        deflate.Write(Encoding.ASCII.GetBytes(
                            System.FormattableString.Invariant($"{mapping}={item.Count}")));
                    }

                    deflate.Write(Encoding.ASCII.GetBytes("\n"));
                }

                // Read the count once so the check and the message agree.
                var remaining = NgramStore.Articles.Count;
                if (remaining != 0 && remaining % 1000 == 0)
                {
                    WriteLine($"Writing an item. {remaining} remaining.");
                }

                // MemoryStream.ToArray is valid even if the writer closed the underlying stream.
                OutputStrings.Enqueue(stream.ToArray());
            }
        }
Beispiel #2
0
        /// <summary>
        /// Counts the n-grams of one article for every order in <c>Options.Orders</c> and
        /// returns them packed into a buffer that is passed through <c>Deflate()</c>.
        /// </summary>
        /// <remarks>
        /// Packed layout before deflating: a 32-byte header whose first four bytes are
        /// <paramref name="articleNumber"/> (the rest stays zero), followed by one 48-byte
        /// slot per distinct n-gram filled from <c>Ngram.ToByteArray()</c>.
        /// As side effects, each n-gram id is assigned an index in <c>NgramIndexes</c> the
        /// first time it is seen, and <c>NgramCounts</c> for that index is incremented the
        /// first time the n-gram is encountered within this call.
        /// </remarks>
        /// <param name="article">Word ids of the article, in order.</param>
        /// <param name="articleNumber">Number stored in the header of the returned buffer.</param>
        /// <returns>The deflated packed buffer.</returns>
        byte[] GetNgrams(uint[] article, int articleNumber)
        {
            const int HeaderSize = 32;
            const int RecordSize = 48;

            var ngrams = new Dictionary<long, Ngram>();

            foreach (var size in Options.Orders)
            {
                // A window of `size` words starting at `start` fits while start + size <= Length,
                // so the last valid start is Length - size (inclusive). A strict '<' here would
                // drop the final n-gram of every order. Articles shorter than `size` yield none.
                var lastStart = article.Length - size;

                for (int start = 0; start <= lastStart; start++)
                {
                    var words = new uint[size];
                    Array.Copy(article, start, words, 0, size);

                    var ngramId = Ngram(words);

                    // Already seen in this article: bump the local count only.
                    if (ngrams.TryGetValue(ngramId, out var seen))
                    {
                        seen.Count++;
                        continue;
                    }

                    // First occurrence in this article: look up or allocate the shared index.
                    int ngramIndex;
                    lock (ngramLock)
                    {
                        if (!NgramIndexes.TryGetValue(ngramId, out ngramIndex))
                        {
                            NgramIndexes[ngramId] = ngramIndex = lastNgramIndex++;
                        }
                    }

                    ngrams[ngramId] = new Ngram {
                        Count = 1, Index = ngramIndex
                    };

                    // Incremented once per article that contains the n-gram, not once per occurrence.
                    lock (NgramCounts)
                    {
                        NgramCounts.TryGetValue(ngramIndex, out int ngramCount);
                        NgramCounts[ngramIndex] = ngramCount + 1;
                    }
                }
            }

            var raw = new byte[HeaderSize + RecordSize * ngrams.Count];

            BitConverter.GetBytes(articleNumber).CopyTo(raw, 0);

            int byteOffset = HeaderSize;
            foreach (var ngram in ngrams.Values)
            {
                // NOTE(review): assumes ToByteArray() returns at most RecordSize bytes - confirm.
                ngram.ToByteArray().CopyTo(raw, byteOffset);
                byteOffset += RecordSize;
            }

            // NOTE(review): `complete` is incremented without synchronisation although the
            // shared dictionaries above are locked - confirm whether this can race.
            if (++complete % 1000 == 0)
            {
                Console.WriteLine($"Finished n-gramming {complete} of {total}");
            }

            return raw.Deflate();
        }