Code example #1
        public byte[] ToArray()
        {
            // 84 = 80-byte block header + 4-byte transaction count
            var ret = new byte[84 + HashCount.Size + FlagByteCount.Size + FlagByteCount + (32 * HashCount)];

            var bh = BlockHeader.ToArray();

            Buffer.BlockCopy(bh, 0, ret, 0, bh.Length);

            var tr = BitConverter.GetBytes(Transactions);

            Buffer.BlockCopy(tr, 0, ret, 80, tr.Length);

            var hc = HashCount.ToArray();

            Buffer.BlockCopy(hc, 0, ret, 84, hc.Length);

            for (var x = 0; x < HashCount; x++)
            {
                var hx = Hashes[x].ToArray();
                Buffer.BlockCopy(hx, 0, ret, 84 + hc.Length + (x * 32), hx.Length);
            }

            var fb = FlagByteCount.ToArray();

            Buffer.BlockCopy(fb, 0, ret, 84 + hc.Length + (32 * HashCount), fb.Length);

            Buffer.BlockCopy(Flags, 0, ret, 84 + hc.Length + (32 * HashCount) + fb.Length, Flags.Length);

            return(ret);
        }
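The fixed offsets above (0, 80, 84, and so on) follow from the merkleblock layout: an 80-byte block header, a 4-byte transaction count, a variable-length hash count, 32 bytes per hash, a variable-length flag byte count, and the flag bits. The same bytes can be produced without manual offset arithmetic; the following is only a sketch that assumes the MerkleBlock members shown in example #1 (and a using System.IO; directive), written in the MemoryStream style of example #4 below rather than taken from the library itself.

        public byte[] ToArraySketch()
        {
            using (var ms = new MemoryStream())
            {
                var bh = BlockHeader.ToArray();
                ms.Write(bh, 0, bh.Length);                   // 80-byte block header

                var tr = BitConverter.GetBytes(Transactions);
                ms.Write(tr, 0, tr.Length);                   // 4-byte transaction count

                var hc = HashCount.ToArray();
                ms.Write(hc, 0, hc.Length);                   // variable-length hash count

                for (var x = 0; x < HashCount; x++)
                {
                    var hx = Hashes[x].ToArray();
                    ms.Write(hx, 0, hx.Length);               // 32-byte hashes
                }

                var fb = FlagByteCount.ToArray();
                ms.Write(fb, 0, fb.Length);                   // variable-length flag byte count

                ms.Write(Flags, 0, Flags.Length);             // flag bits

                return ms.ToArray();
            }
        }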
Code example #2
        public byte[] ToArray()
        {
            var woffset = 0;
            var ret     = new byte[Size];

            ret.CopyAndIncr(BitConverter.GetBytes(Version), ref woffset);
            ret.CopyAndIncr(HashCount.ToArray(), ref woffset);

            foreach (var hash in Hashes)
            {
                ret.CopyAndIncr(hash.ToArray(), ref woffset);
            }

            ret.CopyAndIncr(StopHash.ToArray(), woffset);

            return(ret);
        }
Code example #3
File: MerkleBlock.cs, Project: hashstream/bitcoin-lib
        public byte[] ToArray()
        {
            var woffset = 0;
            var ret     = new byte[Size];

            ret.CopyAndIncr(BlockHeader.ToArray(), ref woffset);
            ret.CopyAndIncr(BitConverter.GetBytes(Transactions), ref woffset);
            ret.CopyAndIncr(HashCount.ToArray(), ref woffset);

            foreach (var h in Hashes)
            {
                ret.CopyAndIncr(h.ToArray(), ref woffset);
            }

            ret.CopyAndIncr(FlagBytes.ToArray(), ref woffset);
            ret.CopyAndIncr(Flags, woffset);

            return(ret);
        }
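Examples #2 and #3 rely on a CopyAndIncr extension method that is not shown on this page. Below is a minimal sketch of what such a helper could look like; the actual hashstream/bitcoin-lib implementation may differ. Both examples pass woffset without ref on their final write, which suggests a second overload that does not advance the offset.

        using System;

        public static class ByteArrayExtensionsSketch
        {
            // Hypothetical helper: copy src into dst at the current write offset,
            // then advance the offset by the number of bytes written.
            public static void CopyAndIncr(this byte[] dst, byte[] src, ref int offset)
            {
                Buffer.BlockCopy(src, 0, dst, offset, src.Length);
                offset += src.Length;
            }

            // Hypothetical overload for the final write, where the offset no longer
            // needs to be advanced.
            public static void CopyAndIncr(this byte[] dst, byte[] src, int offset)
            {
                Buffer.BlockCopy(src, 0, dst, offset, src.Length);
            }
        }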
Code example #4
        public byte[] ToArray()
        {
            using (var ms = new MemoryStream())
            {
                var v = BitConverter.GetBytes(Version);
                ms.Write(v, 0, v.Length);

                var hc = HashCount.ToArray();
                ms.Write(hc, 0, hc.Length);

                for (var x = 0; x < HashCount; x++)
                {
                    var xh = Hashes[x].HashBytes;
                    ms.Write(xh, 0, xh.Length);
                }

                ms.Write(StopHash.HashBytes, 0, StopHash.HashBytes.Length);

                return(ms.ToArray());
            }
        }
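Examples #1 through #4 all call HashCount.ToArray(), and example #1 also reads HashCount.Size, which points at a variable-length integer type. In the Bitcoin wire format such counts are CompactSize ("VarInt") values: a single byte below 0xFD, otherwise a 0xFD/0xFE/0xFF marker followed by a 2-, 4-, or 8-byte little-endian integer. The sketch below illustrates that encoding under the assumption that HashCount and FlagByteCount are types of this kind; it is not the library's actual implementation.

        using System;
        using System.IO;

        // Sketch of a Bitcoin CompactSize ("VarInt") encoder; illustrative only.
        public readonly struct VarIntSketch
        {
            public ulong Value { get; }

            public VarIntSketch(ulong value) => Value = value;

            // Encoded length in bytes: 1, 3, 5, or 9 depending on the magnitude of Value.
            public int Size =>
                Value < 0xFD ? 1 :
                Value <= 0xFFFF ? 3 :
                Value <= 0xFFFFFFFF ? 5 : 9;

            public byte[] ToArray()
            {
                using (var ms = new MemoryStream())
                {
                    if (Value < 0xFD)
                    {
                        ms.WriteByte((byte)Value);
                    }
                    else if (Value <= 0xFFFF)
                    {
                        ms.WriteByte(0xFD);
                        ms.Write(BitConverter.GetBytes((ushort)Value), 0, 2); // little-endian on common platforms
                    }
                    else if (Value <= 0xFFFFFFFF)
                    {
                        ms.WriteByte(0xFE);
                        ms.Write(BitConverter.GetBytes((uint)Value), 0, 4);
                    }
                    else
                    {
                        ms.WriteByte(0xFF);
                        ms.Write(BitConverter.GetBytes(Value), 0, 8);
                    }

                    return ms.ToArray();
                }
            }
        }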
Code example #5
File: Spotter.cs, Project: uzbekdev1/catalyst
        public void TrainWord2Sense(IEnumerable<IDocument> documents, ParallelOptions parallelOptions, int ngrams = 3, double tooRare = 1E-5, double tooCommon = 0.1, Word2SenseTrainingData trainingData = null)
        {
            var HashCount = new ConcurrentDictionary<ulong, int>();
            var Senses    = new ConcurrentDictionary<ulong, ulong[]>();
            var Words     = new ConcurrentDictionary<ulong, string>();

            if (trainingData is object)
            {
                HashCount = new ConcurrentDictionary<ulong, int>(trainingData.HashCount);
                Senses    = new ConcurrentDictionary<ulong, ulong[]>(trainingData.Senses);
                Words     = new ConcurrentDictionary<ulong, string>(trainingData.Words);
            }

            bool ignoreCase        = Data.IgnoreCase;
            bool ignoreOnlyNumeric = Data.IgnoreOnlyNumeric;
            var  stopwords         = new HashSet<ulong>(StopWords.Spacy.For(Language).Select(w => ignoreCase ? IgnoreCaseHash64(w.AsSpan()) : Hash64(w.AsSpan())).ToArray());


            int docCount = 0, tkCount = 0;

            var sw = Stopwatch.StartNew();

            TrainLock.EnterWriteLock();
            try
            {
                Parallel.ForEach(documents, parallelOptions, doc =>
                {
                    try
                    {
                        var stack = new Queue<ulong>(ngrams);

                        if (doc.TokensCount < ngrams)
                        {
                            return; // Ignore documents that are too small
                        }

                        Interlocked.Add(ref tkCount, doc.TokensCount);
                        foreach (var span in doc)
                        {
                            var tokens = span.GetCapturedTokens().ToArray();

                            for (int i = 0; i < tokens.Length; i++)
                            {
                                var tk = tokens[i];

                                var hash = ignoreCase ? IgnoreCaseHash64(tk.ValueAsSpan) : Hash64(tk.ValueAsSpan);

                                bool filterPartOfSpeech = !(tk.POS == PartOfSpeech.ADJ || tk.POS == PartOfSpeech.NOUN || tk.POS == PartOfSpeech.PROPN);

                                bool skipIfHasUpperCase = (!ignoreCase && !tk.ValueAsSpan.IsAllLowerCase());

                                bool skipIfTooSmall = (tk.Length < 3);

                                bool skipIfNotAllLetterOrDigit = !(tk.ValueAsSpan.IsAllLetterOrDigit());

                                bool skipIfStopWordOrEntity = stopwords.Contains(hash) || tk.EntityTypes.Any();

                                //Heuristic for ordinal numbers (i.e. 1st, 2nd, 33rd, etc)
                                bool skipIfMaybeOrdinal = (tk.ValueAsSpan.IndexOfAny(new char[] { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' }, 0) >= 0 &&
                                                           tk.ValueAsSpan.IndexOfAny(new char[] { 't', 'h', 's', 't', 'r', 'd' }, 0) >= 0 &&
                                                           tk.ValueAsSpan.IndexOfAny(new char[] { 'a', 'b', 'c', 'e', 'f', 'g', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'u', 'v', 'w', 'x', 'y', 'z' }, 0) < 0);

                                bool skipIfOnlyNumeric = ignoreOnlyNumeric ? tk.ValueAsSpan.IsLetter() : false;

                                //Only filter for POS if language != any, as otherwise we won't have the POS information
                                bool skipThisToken = (filterPartOfSpeech && Language != Language.Any) || skipIfHasUpperCase || skipIfTooSmall || skipIfNotAllLetterOrDigit || skipIfStopWordOrEntity || skipIfMaybeOrdinal || skipIfOnlyNumeric;

                                if (skipThisToken)
                                {
                                    stack.Clear();
                                    continue;
                                }

                                if (!Words.ContainsKey(hash))
                                {
                                    Words[hash] = ignoreCase ? tk.Value.ToLowerInvariant() : tk.Value;
                                }

                                stack.Enqueue(hash);
                                ulong combined = stack.ElementAt(0);

                                for (int j = 1; j < stack.Count; j++)
                                {
                                    combined = HashCombine64(combined, stack.ElementAt(j));
                                    if (HashCount.ContainsKey(combined))
                                    {
                                        HashCount[combined]++;
                                    }
                                    else
                                    {
                                        Senses[combined]    = stack.Take(j + 1).ToArray();
                                        HashCount[combined] = 1;
                                    }
                                }

                                if (stack.Count > ngrams)
                                {
                                    stack.Dequeue();
                                }
                            }
                        }

                        int count = Interlocked.Increment(ref docCount);

                        if (count % 1000 == 0)
                        {
                            Logger.LogInformation("Training Word2Sense model - at {DOCCOUNT} documents, {TKCOUNT} tokens - elapsed {ELAPSED} seconds at {KTKS} kTk/s", docCount, tkCount, sw.Elapsed.TotalSeconds, (tkCount / sw.ElapsedMilliseconds));
                        }
                    }
                    catch (Exception E)
                    {
                        Logger.LogError(E, "Error during training Word2Sense model");
                    }
                });
            }
            catch (OperationCanceledException)
            {
                return;
            }
            finally
            {
                TrainLock.ExitWriteLock();
            }

            Logger.LogInformation("Finish parsing documents for Word2Sense model");

            int thresholdRare   = (int)Math.Floor(tooRare * docCount);
            int thresholdCommon = (int)Math.Floor(tooCommon * docCount);

            var toKeep = HashCount.Where(kv => kv.Value >= thresholdRare && kv.Value <= thresholdCommon).OrderByDescending(kv => kv.Value)
                         .Select(kv => kv.Key).ToArray();

            foreach (var key in toKeep)
            {
                if (Senses.TryGetValue(key, out var hashes) && HashCount.TryGetValue(key, out var count))
                {
                    Data.Hashes.Add(key);
                    for (int i = 0; i < hashes.Length; i++)
                    {
                        if (Data.MultiGramHashes.Count <= i)
                        {
                            Data.MultiGramHashes.Add(new HashSet<ulong>());
                        }
                        Data.MultiGramHashes[i].Add(hashes[i]);
                    }
                }
            }

            if (trainingData is object)
            {
                trainingData.HashCount = new Dictionary<ulong, int>(HashCount);
                trainingData.Senses    = new Dictionary<ulong, ulong[]>(Senses);
                trainingData.Words     = new Dictionary<ulong, string>(Words);

                foreach (var word in trainingData.Words.Values)
                {
                    AddToGazeteer(word);
                }
            }

            Logger.LogInformation("Finish training Word2Sense model");
        }
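The core of the token loop in example #5 is the rolling n-gram bookkeeping: each token hash is enqueued, combined with the hashes already in the window so that every 2-gram up to window-length gram anchored at the oldest token gets a count, and the window is then trimmed back to at most ngrams entries. The self-contained sketch below isolates just that counting scheme; HashCombine64 is replaced by a stand-in mixing function, so it is illustrative only and not the catalyst implementation.

        using System;
        using System.Collections.Generic;
        using System.Linq;

        public static class NGramCountSketch
        {
            // Stand-in for the library's HashCombine64; any 64-bit mixing function would do here.
            private static ulong HashCombine64(ulong a, ulong b) => (a * 0x100000001B3UL) ^ b;

            // Counts combined hashes over a sliding window, mirroring the queue logic above.
            public static Dictionary<ulong, int> Count(IEnumerable<ulong> tokenHashes, int ngrams)
            {
                var counts = new Dictionary<ulong, int>();
                var window = new Queue<ulong>(ngrams);

                foreach (var hash in tokenHashes)
                {
                    window.Enqueue(hash);

                    // Combine the oldest hash with each newer one, so every 2..window-length
                    // gram anchored at the oldest token gets a count.
                    ulong combined = window.ElementAt(0);
                    for (int j = 1; j < window.Count; j++)
                    {
                        combined = HashCombine64(combined, window.ElementAt(j));
                        counts[combined] = counts.TryGetValue(combined, out var c) ? c + 1 : 1;
                    }

                    // Trim the window back to at most ngrams entries.
                    if (window.Count > ngrams)
                    {
                        window.Dequeue();
                    }
                }

                return counts;
            }
        }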