public byte[] ToArray()
{
    // Total size: 80-byte block header + 4-byte transaction count (= 84),
    // plus the variable-length hash count, the 32-byte hashes, the
    // variable-length flag byte count, and the flag bytes themselves.
    var ret = new byte[84 + HashCount.Size + FlagByteCount.Size + FlagByteCount + (32 * HashCount)];

    var bh = BlockHeader.ToArray();
    Buffer.BlockCopy(bh, 0, ret, 0, bh.Length);

    var tr = BitConverter.GetBytes(Transactions);
    Buffer.BlockCopy(tr, 0, ret, 80, tr.Length);

    var hc = HashCount.ToArray();
    Buffer.BlockCopy(hc, 0, ret, 84, hc.Length);

    // Each 32-byte hash is written immediately after the hash count.
    for (var x = 0; x < HashCount; x++)
    {
        var hx = Hashes[x].ToArray();
        Buffer.BlockCopy(hx, 0, ret, 84 + hc.Length + (x * 32), hx.Length);
    }

    // Flag byte count, then the flag bytes at the tail of the buffer.
    var fb = FlagByteCount.ToArray();
    Buffer.BlockCopy(fb, 0, ret, 84 + hc.Length + (32 * HashCount), fb.Length);
    Buffer.BlockCopy(Flags, 0, ret, 84 + hc.Length + (32 * HashCount) + fb.Length, Flags.Length);

    return ret;
}
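// --- Hedged usage sketch (not from the original source) ---
// Assuming `merkleBlock` is an instance of the type declaring the
// ToArray() above, this spot-checks the fixed part of the layout:
// the 80-byte block header comes first, so the 4-byte little-endian
// transaction count sits at offset 80.
byte[] payload = merkleBlock.ToArray();
uint txCount = BitConverter.ToUInt32(payload, 80);
Console.WriteLine($"{payload.Length} bytes, {txCount} transactions");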
public byte[] ToArray()
{
    var woffset = 0;
    var ret = new byte[Size];

    // Write each field sequentially; the ref overload of CopyAndIncr
    // advances woffset by the number of bytes copied.
    ret.CopyAndIncr(BitConverter.GetBytes(Version), ref woffset);
    ret.CopyAndIncr(HashCount.ToArray(), ref woffset);

    foreach (var hash in Hashes)
    {
        ret.CopyAndIncr(hash.ToArray(), ref woffset);
    }

    // Final field: the stop hash; the offset is not needed afterwards.
    ret.CopyAndIncr(StopHash.ToArray(), woffset);

    return ret;
}
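// --- Hedged reconstruction (not from the original source) ---
// CopyAndIncr is a custom extension, not a BCL method. Judging only
// from its call sites above, a minimal implementation could look like
// this: copy `src` into `dst` at `offset`, with a ref overload that
// advances the offset and a plain overload for the final write.
public static class ByteArrayExtensions
{
    public static void CopyAndIncr(this byte[] dst, byte[] src, ref int offset)
    {
        Buffer.BlockCopy(src, 0, dst, offset, src.Length);
        offset += src.Length;
    }

    public static void CopyAndIncr(this byte[] dst, byte[] src, int offset)
    {
        Buffer.BlockCopy(src, 0, dst, offset, src.Length);
    }
}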
public byte[] ToArray()
{
    var woffset = 0;
    var ret = new byte[Size];

    // Block header, transaction count, hash count, hashes, flag byte
    // count, then the flag bytes; CopyAndIncr tracks the write offset,
    // so no byte offsets need to be computed by hand.
    ret.CopyAndIncr(BlockHeader.ToArray(), ref woffset);
    ret.CopyAndIncr(BitConverter.GetBytes(Transactions), ref woffset);
    ret.CopyAndIncr(HashCount.ToArray(), ref woffset);

    foreach (var h in Hashes)
    {
        ret.CopyAndIncr(h.ToArray(), ref woffset);
    }

    ret.CopyAndIncr(FlagBytes.ToArray(), ref woffset);
    ret.CopyAndIncr(Flags, woffset);

    return ret;
}
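// --- Hedged sanity check (not from the original source) ---
// Assuming `Size` accounts for every field written above, the
// serialized buffer should be exactly that long, with the flag bytes
// at its tail.
byte[] bytes = merkleBlock.ToArray();
System.Diagnostics.Debug.Assert(bytes.Length == merkleBlock.Size);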
public byte[] ToArray()
{
    using (var ms = new MemoryStream())
    {
        // Version (little-endian), followed by the variable-length hash count.
        var v = BitConverter.GetBytes(Version);
        ms.Write(v, 0, v.Length);

        var hc = HashCount.ToArray();
        ms.Write(hc, 0, hc.Length);

        // Each hash in order, then the terminating stop hash.
        for (var x = 0; x < HashCount; x++)
        {
            var xh = Hashes[x].HashBytes;
            ms.Write(xh, 0, xh.Length);
        }

        ms.Write(StopHash.HashBytes, 0, StopHash.HashBytes.Length);

        return ms.ToArray();
    }
}
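// --- Hedged alternative sketch (not from the original source) ---
// BinaryWriter always emits primitives little-endian, so it can stand
// in for the BitConverter call above (which is little-endian only on
// little-endian hosts). `ToArrayViaWriter` is a hypothetical name;
// all property names are taken from the method above.
public byte[] ToArrayViaWriter()
{
    using (var ms = new MemoryStream())
    using (var bw = new BinaryWriter(ms))
    {
        bw.Write(Version);                 // 4-byte little-endian version
        bw.Write(HashCount.ToArray());     // variable-length hash count
        for (var x = 0; x < HashCount; x++)
        {
            bw.Write(Hashes[x].HashBytes); // 32-byte hashes
        }
        bw.Write(StopHash.HashBytes);      // terminating stop hash
        return ms.ToArray();
    }
}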
public void TrainWord2Sense(IEnumerable<IDocument> documents, ParallelOptions parallelOptions, int ngrams = 3, double tooRare = 1E-5, double tooCommon = 0.1, Word2SenseTrainingData trainingData = null)
{
    var HashCount = new ConcurrentDictionary<ulong, int>();
    var Senses    = new ConcurrentDictionary<ulong, ulong[]>();
    var Words     = new ConcurrentDictionary<ulong, string>();

    // Resume from previously accumulated training data when provided.
    if (trainingData is object)
    {
        HashCount = new ConcurrentDictionary<ulong, int>(trainingData.HashCount);
        Senses    = new ConcurrentDictionary<ulong, ulong[]>(trainingData.Senses);
        Words     = new ConcurrentDictionary<ulong, string>(trainingData.Words);
    }

    bool ignoreCase        = Data.IgnoreCase;
    bool ignoreOnlyNumeric = Data.IgnoreOnlyNumeric;
    var stopwords = new HashSet<ulong>(StopWords.Spacy.For(Language).Select(w => ignoreCase ? IgnoreCaseHash64(w.AsSpan()) : Hash64(w.AsSpan())).ToArray());

    int docCount = 0, tkCount = 0;
    var sw = Stopwatch.StartNew();

    TrainLock.EnterWriteLock();
    try
    {
        Parallel.ForEach(documents, parallelOptions, doc =>
        {
            try
            {
                // Sliding window over the hashes of the last `ngrams` tokens.
                var stack = new Queue<ulong>(ngrams);

                if (doc.TokensCount < ngrams) { return; } // Ignore documents that are too small

                Interlocked.Add(ref tkCount, doc.TokensCount);

                foreach (var span in doc)
                {
                    var tokens = span.GetCapturedTokens().ToArray();

                    for (int i = 0; i < tokens.Length; i++)
                    {
                        var tk   = tokens[i];
                        var hash = ignoreCase ? IgnoreCaseHash64(tk.ValueAsSpan) : Hash64(tk.ValueAsSpan);

                        bool filterPartOfSpeech        = !(tk.POS == PartOfSpeech.ADJ || tk.POS == PartOfSpeech.NOUN || tk.POS == PartOfSpeech.PROPN);
                        bool skipIfHasUpperCase        = !ignoreCase && !tk.ValueAsSpan.IsAllLowerCase();
                        bool skipIfTooSmall            = tk.Length < 3;
                        bool skipIfNotAllLetterOrDigit = !tk.ValueAsSpan.IsAllLetterOrDigit();
                        bool skipIfStopWordOrEntity    = stopwords.Contains(hash) || tk.EntityTypes.Any();

                        // Heuristic for ordinal numbers (i.e. 1st, 2nd, 33rd, etc.): contains a digit
                        // and an ordinal-suffix letter, but no other letters.
                        bool skipIfMaybeOrdinal = tk.ValueAsSpan.IndexOfAny(new char[] { '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' }, 0) >= 0 &&
                                                  tk.ValueAsSpan.IndexOfAny(new char[] { 't', 'h', 's', 'r', 'd' }, 0) >= 0 &&
                                                  tk.ValueAsSpan.IndexOfAny(new char[] { 'a', 'b', 'c', 'e', 'f', 'g', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'u', 'v', 'w', 'x', 'y', 'z' }, 0) < 0;

                        // Skip tokens that are not purely alphabetic (i.e. contain digits) when configured to.
                        bool skipIfOnlyNumeric = ignoreOnlyNumeric && !tk.ValueAsSpan.IsLetter();

                        // Only filter on POS if language != Any, as otherwise we won't have the POS information.
                        bool skipThisToken = (filterPartOfSpeech && Language != Language.Any) || skipIfHasUpperCase || skipIfTooSmall ||
                                             skipIfNotAllLetterOrDigit || skipIfStopWordOrEntity || skipIfMaybeOrdinal || skipIfOnlyNumeric;

                        if (skipThisToken)
                        {
                            stack.Clear(); // A skipped token breaks the current n-gram window.
                            continue;
                        }

                        Words.TryAdd(hash, ignoreCase ? tk.Value.ToLowerInvariant() : tk.Value);

                        stack.Enqueue(hash);

                        // Count every n-gram (length 2..stack.Count) ending at the current token.
                        ulong combined = stack.ElementAt(0);
                        for (int j = 1; j < stack.Count; j++)
                        {
                            combined = HashCombine64(combined, stack.ElementAt(j));
                            // AddOrUpdate keeps the increment atomic across the parallel loop;
                            // on first sight of an n-gram, remember the hashes composing it.
                            if (HashCount.AddOrUpdate(combined, 1, (_, v) => v + 1) == 1)
                            {
                                Senses[combined] = stack.Take(j + 1).ToArray();
                            }
                        }

                        if (stack.Count > ngrams) { stack.Dequeue(); }
                    }
                }

                int count = Interlocked.Increment(ref docCount);
                if (count % 1000 == 0)
                {
                    Logger.LogInformation("Training Word2Sense model - at {DOCCOUNT} documents, {TKCOUNT} tokens - elapsed {ELAPSED} seconds at {KTKS} kTk/s", docCount, tkCount, sw.Elapsed.TotalSeconds, tkCount / sw.ElapsedMilliseconds);
                }
            }
            catch (Exception E)
            {
                Logger.LogError(E, "Error during training Word2Sense model");
            }
        });
    }
    catch (OperationCanceledException)
    {
        return;
    }
    finally
    {
        TrainLock.ExitWriteLock();
    }

    Logger.LogInformation("Finish parsing documents for Word2Sense model");

    // Keep only n-grams that are neither too rare nor too common, relative to the corpus size.
    int thresholdRare   = (int)Math.Floor(tooRare * docCount);
    int thresholdCommon = (int)Math.Floor(tooCommon * docCount);

    var toKeep = HashCount.Where(kv => kv.Value >= thresholdRare && kv.Value <= thresholdCommon)
                          .OrderByDescending(kv => kv.Value)
                          .Select(kv => kv.Key)
                          .ToArray();

    foreach (var key in toKeep)
    {
        if (Senses.TryGetValue(key, out var hashes) && HashCount.TryGetValue(key, out var count))
        {
            Data.Hashes.Add(key);
            for (int i = 0; i < hashes.Length; i++)
            {
                if (Data.MultiGramHashes.Count <= i)
                {
                    Data.MultiGramHashes.Add(new HashSet<ulong>());
                }
                Data.MultiGramHashes[i].Add(hashes[i]);
            }
        }
    }

    // Copy the counts back so training can be resumed later.
    if (trainingData is object)
    {
        trainingData.HashCount = new Dictionary<ulong, int>(HashCount);
        trainingData.Senses    = new Dictionary<ulong, ulong[]>(Senses);
        trainingData.Words     = new Dictionary<ulong, string>(Words);

        foreach (var word in trainingData.Words.Values)
        {
            AddToGazeteer(word);
        }
    }

    Logger.LogInformation("Finish training Word2Sense model");
}
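// --- Hedged call-site sketch (not from the original source) ---
// `spotter` stands in for the instance exposing TrainWord2Sense, and
// `documents` for any IEnumerable<IDocument>; the parameter values
// echo the defaults in the signature above.
var trainingData = new Word2SenseTrainingData();
var options = new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount };
spotter.TrainWord2Sense(documents, options,
                        ngrams: 3,      // sliding window of up to trigrams
                        tooRare: 1E-5,  // drop n-grams seen in under 0.001% of documents
                        tooCommon: 0.1, // drop n-grams seen in over 10% of documents
                        trainingData: trainingData); // counts are copied back here for later resumption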