/// <summary>Creates an entity annotation with an explicit tag, optional metadata and a target document reference.</summary>
public EntityType(string type, EntityTag tag, Dictionary<string, string> metadata, UID128 targetUID)
{
    Type      = type;
    Tag       = tag;
    Metadata  = metadata;
    TargetUID = targetUID;
}
/// <summary>
/// Returns the last recorded status for the given compilation UID; a UID that has
/// never been recorded is reported as <c>CompilationStatus.Pending</c>.
/// </summary>
public static CompilationStatus GetStatusOf(UID128 uid)
{
    return _perCompilationStatus.TryGetValue(uid, out var status)
        ? status
        : CompilationStatus.Pending;
}
/// <summary>
/// Gathers the queued log messages and the current status for a compilation into a
/// single result payload. Note that fetching the messages drains the underlying queue.
/// </summary>
public async UnaryResult<CompilationResult> GetStatusAsync(UID128 compilationUID)
{
    var messages = Logging.GetMessagesOf(compilationUID).ToArray();
    var status = Logging.GetStatusOf(compilationUID);
    return new CompilationResult() { Status = status, Messages = messages };
}
/// <summary>
/// Builds a document from pre-tokenized state. Empty metadata dictionaries are
/// normalized to null (both document-level and token-level); all other values are
/// stored exactly as supplied.
/// </summary>
public Document(Language language, string value, List<List<TokenData>> tokensData, List<int[]> spanBounds, Dictionary<string, string> metadata, UID128 uid, List<string> labels, Dictionary<long, List<EntityType>> entityData, Dictionary<long, Dictionary<string, string>> tokenMetadata)
{
    Language = language;
    Value = value;
    TokensData = tokensData;
    SpanBounds = spanBounds;
    UID = uid;
    Labels = labels;
    EntityData = entityData;

    // Store null rather than an empty dictionary (same normalization for both fields).
    if (metadata is null || metadata.Count == 0)
    {
        Metadata = null;
    }
    else
    {
        Metadata = metadata;
    }

    if (tokenMetadata is null || tokenMetadata.Count == 0)
    {
        TokenMetadata = null;
    }
    else
    {
        TokenMetadata = tokenMetadata;
    }
}
/// <summary>
/// Drains and yields the pending log messages for a compilation. Messages are
/// dequeued as they are yielded, so a second call only returns messages logged
/// since the previous one; an unknown UID yields nothing.
/// </summary>
public static IEnumerable<LogMessage> GetMessagesOf(UID128 uid)
{
    if (_perCompilation.TryGetValue(uid, out var queue))
    {
        while (queue.TryDequeue(out var message))
        {
            yield return message;
        }
    }
}
/// <summary>
/// Constructs an immutable snapshot of a document; every value is stored exactly as
/// supplied (no empty-to-null normalization is applied here).
/// </summary>
public ImmutableDocument(Language language, string value, TokenData[][] tokensData, long[] spanBounds, Dictionary<string, string> metadata, UID128 uID, string[] labels, Dictionary<long, EntityType[]> entityData, Dictionary<long, Dictionary<string, string>> tokenMetadata)
{
    Language = language;
    Value = value;
    UID = uID;
    Labels = labels;
    TokensData = tokensData;
    SpanBounds = spanBounds;
    Metadata = metadata;
    EntityData = entityData;
    TokenMetadata = tokenMetadata;
}
/// <summary>
/// Client-side wrapper: queries the remote service for a compilation's status,
/// bounded by the configured timeout and the caller's cancellation token.
/// </summary>
public async Task<CompilationResult> GetStatusAsync(UID128 compilationUID, CancellationToken cancellationToken)
{
    var deadline = DateTime.UtcNow.Add(_timeout);
    var call = _client
        .WithDeadline(deadline)
        .WithCancellationToken(cancellationToken);
    return await call.GetStatusAsync(compilationUID);
}
// Console demo: trains a FastText PVDM model over blog posts pulled from GitHub and
// prints, for each post, the three most similar posts found via an HNSW
// nearest-neighbour search over the resulting document vectors.
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var posts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var language = Language.English;
    var pipeline = Pipeline.For(language);
    var postsWithDocuments = posts
        .Select(post =>
        {
            // Each document is keyed by a 128-bit hash of the post title so the FastText
            // vectors can be mapped back to their posts further down.
            var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language) { UID = post.Title.Hash128() };
            pipeline.ProcessSingle(document);
            return (Post: post, Document: document);
        })
        .ToArray(); // Call ToArray to force evaluation of the document processing now

    Console.WriteLine("Training FastText model..");
    var fastText = new FastText(language, version: 0, tag: "");
    fastText.Data.Type = FastText.ModelType.PVDM;
    fastText.Data.Loss = FastText.LossType.NegativeSampling;
    fastText.Data.IgnoreCase = true;
    fastText.Data.Epoch = 50;
    fastText.Data.Dimensions = 512;
    fastText.Data.MinimumCount = 1;
    fastText.Data.ContextWindow = 10;
    fastText.Data.NegativeSamplingCount = 20;
    fastText.Train(
        postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
        trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
    );

    Console.WriteLine("Building recommendations..");

    // Combine the blog post data with the FastText-generated vectors
    var results = fastText
        .GetDocumentVectors()
        .Select(result =>
        {
            // Each document vector instance will include a "token" string that may be mapped back to the
            // UID of the document for each blog post. If there were a large number of posts to deal with
            // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
            // have a 100+ and so a LINQ "First" scan over the list will suffice.
            var uid = UID128.Parse(result.Token);
            var postForResult = postsWithDocuments.First(
                postWithDocument => postWithDocument.Document.UID == uid
            );
            return (UID: uid, result.Vector, postForResult.Post);
        })
        .ToArray(); // ToArray since we enumerate multiple times below

    // Construct a graph to search over, as described at
    // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(results);

    // For every post, use the "KNNSearch" method on the graph to find the three most similar posts
    const int maximumNumberOfResultsToReturn = 3;
    var postsWithSimilarResults = results
        .Select(result =>
        {
            // Request one result too many from the KNNSearch call because it's expected that the original
            // post will come back as the best match and we'll want to exclude that
            var similarResults = graph
                .KNNSearch(result, maximumNumberOfResultsToReturn + 1)
                .Where(similarResult => similarResult.Item.UID != result.UID)
                .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included
            return (new
            {
                result.Post,
                Similar = similarResults
                    .Select(similarResult => new { similarResult.Item.Post, similarResult.Distance })
                    .ToArray()
            });
        })
        .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    // Report: one section per post, nearest matches first (smaller distance = more similar).
    foreach (var postWithSimilarResults in postsWithSimilarResults)
    {
        Console.WriteLine();
        Console.WriteLine(postWithSimilarResults.Post.Title);
        foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
/// <summary>
/// Reconstructs a <see cref="Document"/> from its JSON (JObject) representation:
/// top-level fields first, then one span per TokensData entry, re-adding each token
/// with its POS tag, head, frequency, entity annotations and metadata.
/// </summary>
public static Document FromJObject(JObject jo)
{
    // Shared empty list so tokens without entity annotations don't each allocate one.
    var emptyEntityTypes = new List<EntityType>();
    var doc = new Document();
    doc.Language = Languages.CodeToEnum((string)jo[nameof(Language)]);
    doc.Value = (string)jo[nameof(Value)];
    // A missing or unparsable UID falls back to the default UID128 value.
    doc.UID = UID128.TryParse((string)(jo[nameof(UID)]), out var uid) ? uid : default(UID128);

    // Document-level metadata (optional).
    var docmtd = jo[nameof(Metadata)];
    if (!(docmtd is null) && docmtd.HasValues)
    {
        doc.Metadata = new Dictionary<string, string>();
        foreach (JProperty md in docmtd) { doc.Metadata.Add(md.Name, (string)md.Value); }
    }

    if (jo.ContainsKey(nameof(Labels)))
    {
        doc.Labels = jo[nameof(Labels)].Select(jt => (string)jt).ToList();
    }

    var td = jo[nameof(TokensData)];
    foreach (var sp in td) // one entry per span
    {
        // Tokens are first collected into tuples, then materialized onto the span below.
        var tokens = new List<(int begin, int end, PartOfSpeech tag, int head, float frequency, List<EntityType> entityType, IDictionary<string, string> metadata, string replacement)>();
        foreach (var tk in sp)
        {
            // Entity annotations for this token (optional).
            var ets = tk[nameof(EntityType)];
            var entityTypes = emptyEntityTypes;
            if (!(ets is null) && ets.HasValues)
            {
                entityTypes = new List<EntityType>();
                foreach (var et in ets)
                {
                    Dictionary<string, string> entityMetadata = null;
                    var etmtd = et[nameof(Metadata)];
                    if (!(etmtd is null) && etmtd.HasValues)
                    {
                        entityMetadata = new Dictionary<string, string>();
                        foreach (JProperty md in etmtd) { entityMetadata.Add(md.Name, (string)md.Value); }
                    }
                    entityTypes.Add(new EntityType((string)(et[nameof(EntityType.Type)]),
                                                   (EntityTag)Enum.Parse(typeof(EntityTag), (string)(et[nameof(EntityType.Tag)])),
                                                   entityMetadata,
                                                   UID128.TryParse((string)(et[nameof(EntityType.TargetUID)]), out var uid2) ? uid2 : default(UID128)));
                }
            }

            // Per-token metadata (optional).
            IDictionary<string, string> metadata = null;
            var mtd = tk[nameof(Metadata)];
            if (!(mtd is null) && mtd.HasValues)
            {
                metadata = new Dictionary<string, string>();
                foreach (JProperty md in mtd) { metadata.Add(md.Name, (string)md.Value); }
            }

            // Missing Tag/Head/Frequency default to NONE / -1 / 0 respectively.
            tokens.Add((((int)(tk[nameof(TokenData.Bounds)][0])),
                        ((int)(tk[nameof(TokenData.Bounds)][1])),
                        (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), (string)(tk[nameof(TokenData.Tag)] ?? nameof(PartOfSpeech.NONE))),
                        ((int)(tk[nameof(TokenData.Head)] ?? "-1")),
                        (((float)(tk[nameof(TokenData.Frequency)] ?? 0f))),
                        entityTypes,
                        metadata,
                        (string)tk[nameof(TokenData.Replacement)]));
        }

        if (tokens.Any())
        {
            // The span covers from the first token's begin to the last token's end.
            var span = doc.AddSpan(tokens.First().begin, tokens.Last().end);
            foreach (var tk in tokens)
            {
                var token = span.AddToken(tk.begin, tk.end);
                token.POS = tk.tag;
                token.Head = tk.head;
                token.Frequency = tk.frequency;
                foreach (var et in tk.entityType) { token.AddEntityType(et); }
                if (tk.metadata is object)
                {
                    foreach (var kv in tk.metadata) { token.Metadata.Add(kv.Key, kv.Value); }
                }
            }
        }
    }
    return (doc);
}
/// <summary>
/// Registers <paramref name="entry"/> as a known concept resolving to <paramref name="uid"/>.
/// Single words are stored under their (optionally case-insensitive) 64-bit hash;
/// multi-word entries additionally record each word's hash per position (so the
/// recognizer can extend matches token-by-token) and are stored under the combined hash.
/// Non-letter words are also registered as tokenizer exceptions so they are kept as-is.
/// </summary>
/// <param name="entry">Surface form to recognize; ignored when null/whitespace, or when
/// purely numeric and <c>Data.IgnoreOnlyNumeric</c> is set.</param>
/// <param name="uid">Identifier the recognized entry resolves to.</param>
public void AddEntry(string entry, UID128 uid)
{
    void AddSingleTokenConcept(ulong entryHash)
    {
        Data.Hashes[entryHash] = uid;
    }

    if (string.IsNullOrWhiteSpace(entry)) { return; }

    //Ignore pure numerical entries
    if (Data.IgnoreOnlyNumeric && int.TryParse(entry, out _)) { return; }

    var words = entry.Trim().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    if (words.Length == 1)
    {
        var hash = Data.IgnoreCase ? Spotter.IgnoreCaseHash64(words[0].AsSpan()) : Spotter.Hash64(words[0].AsSpan());
        AddSingleTokenConcept(hash);

        if (!words[0].AsSpan().IsLetter())
        {
            //Null means don't replace by anything - keep token as is
            Data.TokenizerExceptions[words[0].CaseSensitiveHash32()] = new TokenizationException(null);
        }
        return;
    }

    ulong combinedHash = 0;
    for (int n = 0; n < words.Length; n++)
    {
        var word_hash = Data.IgnoreCase ? Spotter.IgnoreCaseHash64(words[n].AsSpan()) : Spotter.Hash64(words[n].AsSpan());
        combinedHash = (n == 0) ? word_hash : Spotter.HashCombine64(combinedHash, word_hash);

        // Ensure there is a per-position hash set for word n.
        if (Data.MultiGramHashes.Count < n + 1)
        {
            Data.MultiGramHashes.Add(new HashSet<ulong>());
        }

        // HashSet<T>.Add is a no-op (returns false) when the value is already present,
        // so the previous Contains-then-Add double lookup was unnecessary.
        Data.MultiGramHashes[n].Add(word_hash);

        if (!words[n].AsSpan().IsLetter())
        {
            //Null means don't replace by anything - keep token as is
            Data.TokenizerExceptions[words[n].CaseSensitiveHash32()] = new TokenizationException(null);
        }
    }

    // The whole phrase is resolvable through the combined hash of all its words.
    AddSingleTokenConcept(combinedHash);
}
/// <summary>
/// Scans the span's tokens against the registered hashes and tags matches with this
/// spotter's capture tag: Single for one-token matches, Begin/Inside/End across a
/// multi-token (longest-wins) match. Returns true if at least one entity was found.
/// </summary>
/// <param name="ispan">Span whose tokens are scanned.</param>
/// <param name="stopOnFirstFound">When true, returns on the first match without
/// tagging anything — a cheap "does this contain any entity?" probe.</param>
public bool RecognizeEntities(ISpan ispan, bool stopOnFirstFound = false)
{
    var tokens = ispan.ToTokenSpan();
    int N = tokens.Length;
    bool hasMultiGram = Data.MultiGramHashes.Any();
    bool foundAny = false;
    for (int i = 0; i < N; i++)
    {
        var tk = tokens[i];
        //if (tk.POS != PartOfSpeechEnum.NOUN && tk.POS != PartOfSpeechEnum.ADJ && tk.POS != PartOfSpeechEnum.PROPN) { continue; }

        var tokenHash = Data.IgnoreCase ? IgnoreCaseHash64(tk.ValueAsSpan) : Hash64(tk.ValueAsSpan);

        // Multi-token matching, attempted only when this token's hash can begin a phrase.
        if (hasMultiGram && Data.MultiGramHashes[0].Contains(tokenHash))
        {
            // Never look further than the longest registered phrase or past the span's end.
            int window = Math.Min(N - i, Data.MultiGramHashes.Count);
            ulong hash = tokenHash;
            // NOTE(review): someTokenHasReplacements is accumulated but never read below — confirm whether it can be removed.
            bool someTokenHasReplacements = tk.Replacement is object;
            int i_final = i;          // index of the last token of the longest match found so far
            UID128 uid_final = default;
            for (int n = 1; n < window; n++)
            {
                var next = tokens[n + i];
                someTokenHasReplacements |= (next.Replacement is object);

                var nextHash = Data.IgnoreCase ? IgnoreCaseHash64(next.ValueAsSpan) : Hash64(next.ValueAsSpan);
                if (Data.MultiGramHashes[n].Contains(nextHash))
                {
                    //txt += " " + next.Value;
                    //var hashTxt = Hash64(txt);
                    hash = HashCombine64(hash, nextHash);
                    // Longest match wins: remember this end position and keep extending.
                    if (Data.Hashes.TryGetValue(hash, out var uid_multi))
                    {
                        i_final = i + n;
                        uid_final = uid_multi;
                    }
                }
                else
                {
                    break; // word n doesn't continue any registered phrase
                }
            }

            if (i_final > i)
            {
                foundAny = true;
                if (stopOnFirstFound) { return (foundAny); } //Used for checking if the document contains any entity
                // Tag the phrase: Begin on the first token, End on the last, Inside in between.
                tk.AddEntityType(new EntityType(CaptureTag, EntityTag.Begin, uid_final));
                tokens[i_final].AddEntityType(new EntityType(CaptureTag, EntityTag.End, uid_final));
                for (int m = i + 1; m < (i_final); m++)
                {
                    tokens[m].AddEntityType(new EntityType(CaptureTag, EntityTag.Inside, uid_final));
                }
            }

            // Skip past the matched phrase (no-op when no multi-token match was found).
            i = i_final;
        }

        // Single-token match, checked with the hash of the token at the loop's original
        // position. NOTE(review): a token can therefore receive both a Begin tag (above)
        // and a Single tag — confirm this double-tagging is intended.
        if (Data.Hashes.TryGetValue(tokenHash, out var uid))
        {
            foundAny = true;
            if (stopOnFirstFound) { return (foundAny); } //Used for checking if the document contains any entity
            tk.AddEntityType(new EntityType(CaptureTag, EntityTag.Single, uid));
        }
    }
    return (foundAny);
}
/// <summary>
/// Logs the abort request (the "==== ABORT" marker is what the log router keys on)
/// and asks the compilation processor to cancel the given compilation.
/// </summary>
public async UnaryResult<Nil> AbortAsync(UID128 compilationUID)
{
    Logger.ZLogInformation("==== ABORT {0}", compilationUID);
    CompilationProcessor.Abort(compilationUID);
    return Nil.Default;
}
/// <summary>
/// Formats a log entry and routes it. Control messages carrying a "====" marker
/// ("==== RCV/BEGIN/SUCCESS/FAIL/ABORT/CANCELED ...") update the per-compilation
/// status tables; every other message is queued under the compilation currently
/// in progress (if any).
/// </summary>
private static void WriteEntry(IZLoggerEntry entry)
{
    // Boxed up-front so FormatUtf8 can write into the struct builder via the interface.
    var boxedBuilder = (IBufferWriter<byte>) new Utf8ValueStringBuilder(false);
    try
    {
        entry.FormatUtf8(boxedBuilder, options, null);
        var message = boxedBuilder.ToString();
        var marker = message.IndexOf("====");
        if (marker >= 0)
        {
            var content = message.Substring(marker + 5); // skip "==== " (marker plus the separating space)
            if (content.StartsWith("RCV"))
            {
                var uid = UID128.Parse(content.Substring(4, 22)); // UID follows "RCV " and is 22 chars long
                _perCompilationStatus[uid] = CompilationStatus.Pending;
            }
            else if (content.StartsWith("BEGIN"))
            {
                _currentUID = UID128.Parse(content.Substring(6, 22)); // UID follows "BEGIN "
                _perCompilation[_currentUID] = new ConcurrentQueue<LogMessage>();
                _perCompilationStatus[_currentUID] = CompilationStatus.OnGoing;
            }
            else if (content.StartsWith("SUCCESS"))
            {
                _perCompilationStatus[_currentUID] = CompilationStatus.Success;
                _currentUID = default;
            }
            else if (content.StartsWith("FAIL") || content.StartsWith("ABORT") || content.StartsWith("CANCELED"))
            {
                // All three terminal failure modes collapse into a single Fail status.
                // NOTE(review): if one of these arrives with no BEGIN in flight, this
                // writes under the default UID — confirm that ordering cannot happen.
                _perCompilationStatus[_currentUID] = CompilationStatus.Fail;
                _currentUID = default;
            }
        }
        else if (_currentUID.IsNotNull())
        {
            // Ordinary log line: attribute it to the compilation currently in progress.
            _perCompilation[_currentUID].Enqueue(new LogMessage()
            {
                Message = message,
                Timestamp = entry.LogInfo.Timestamp.ToUnixTimeMilliseconds(),
                LogLevel = entry.LogInfo.LogLevel
            });
        }
    }
    finally
    {
        // Utf8ValueStringBuilder rents a pooled buffer that must be returned; the
        // original empty finally block leaked it on every call.
        (boxedBuilder as IDisposable)?.Dispose();
    }
}
/// <summary>Creates an entity annotation with no metadata, pointing at <paramref name="targetUID"/>.</summary>
public EntityType(string type, EntityTag tag, UID128 targetUID)
{
    Metadata  = null;
    TargetUID = targetUID;
    Type      = type;
    Tag       = tag;
}
/// <summary>Creates an entity annotation with no metadata and no target document.</summary>
public EntityType(string type, EntityTag tag)
{
    Metadata  = null;
    TargetUID = default(UID128);
    Type      = type;
    Tag       = tag;
}
/// <summary>
/// Client-side wrapper: asks the remote service to abort a compilation, bounded by
/// the configured timeout and the caller's cancellation token.
/// </summary>
public async Task AbortAsync(UID128 compilationUID, CancellationToken cancellationToken)
{
    var deadline = DateTime.UtcNow.Add(_timeout);
    var call = _client
        .WithDeadline(deadline)
        .WithCancellationToken(cancellationToken);
    await call.AbortAsync(compilationUID);
}
// Console demo: trains FastText (PVDM) document vectors AND a TF-IDF model over blog
// posts, then recommends similar posts by combining HNSW vector distance with a
// title-based TF-IDF proximity score.
private static async Task Main()
{
    Console.WriteLine("Reading posts from GitHub repo..");
    var posts = await GetBlogPosts();

    Console.WriteLine("Parsing documents..");
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
    var language = Language.English;
    var pipeline = Pipeline.For(language);
    var postsWithDocuments = posts
        .Select(post =>
        {
            // Each document is keyed by a 128-bit hash of the post title so the FastText
            // vectors can be mapped back to their posts further down.
            var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language) { UID = post.Title.Hash128() };
            pipeline.ProcessSingle(document);
            return (Post: post, Document: document);
        })
        .ToArray(); // Call ToArray to force evaluation of the document processing now

    Console.WriteLine("Training FastText model..");
    var fastText = new FastText(language, version: 0, tag: "");
    fastText.Data.Type = FastText.ModelType.PVDM;
    fastText.Data.Loss = FastText.LossType.NegativeSampling;
    fastText.Data.IgnoreCase = true;
    fastText.Data.Epoch = 50;
    fastText.Data.Dimensions = 512;
    fastText.Data.MinimumCount = 1;
    fastText.Data.ContextWindow = 10;
    fastText.Data.NegativeSamplingCount = 20;
    fastText.Train(
        postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
        trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
    );

    Console.WriteLine("Training TF-IDF model..");
    var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
    await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

    Console.WriteLine("Getting average TF-IDF weights per word..");
    // Collect every per-token weight observed for each word (case-insensitively)
    // across all posts..
    var tokenValueTFIDF = new Dictionary<string, List<float>>(StringComparer.OrdinalIgnoreCase);
    foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
    {
        // After Process, token.Frequency is read as the word's weight below —
        // presumably the TF-IDF score; confirm against the TFIDF implementation.
        tfidf.Process(doc);
        foreach (var sentence in doc)
        {
            foreach (var token in sentence)
            {
                if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                {
                    freqs = new();
                    tokenValueTFIDF.Add(token.Value, freqs);
                }
                freqs.Add(token.Frequency);
            }
        }
    }
    // ..then average them into a single weight per word.
    var averagedTokenValueTFIDF = tokenValueTFIDF.ToDictionary(
        entry => entry.Key,
        entry => entry.Value.Average(),
        StringComparer.OrdinalIgnoreCase
    );

    Console.WriteLine("Building recommendations..");

    // Combine the blog post data with the FastText-generated vectors
    var results = fastText
        .GetDocumentVectors()
        .Select(result =>
        {
            // Each document vector instance will include a "token" string that may be mapped back to the
            // UID of the document for each blog post. If there were a large number of posts to deal with
            // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
            // have a 100+ and so a LINQ "First" scan over the list will suffice.
            var uid = UID128.Parse(result.Token);
            var postForResult = postsWithDocuments.First(
                postWithDocument => postWithDocument.Document.UID == uid
            );
            return (UID: uid, result.Vector, postForResult.Post);
        })
        .ToArray(); // ToArray since we enumerate multiple times below

    // Construct a graph to search over, as described at
    // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
    var graph = new SmallWorld<(UID128 UID, float[] Vector, BlogPost Post), float>(
        distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
        DefaultRandomGenerator.Instance,
        new() { M = 15, LevelLambda = 1 / Math.Log(15) }
    );
    graph.AddItems(results);

    const int maximumNumberOfResultsToReturn = 3;
    var postsWithSimilarResults = results
        .Select(result =>
        {
            // Request that the KNNSearch operate over all documents because we can't take the top {n}
            // until we've combined the ordering with the title TFIDF proximity values
            var similarResults = graph
                .KNNSearch(result, postsWithDocuments.Length)
                .Where(similarResult => similarResult.Item.UID != result.UID);

            // Words appearing in the current post's title, used for the proximity score.
            var tokenValuesInTitle =
                GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

            return (new
            {
                result.Post,
                Similar = similarResults
                    .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance,
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                            NormaliseSomeCommonTerms(similarResult.Item.Post.Title),
                            tokenValuesInTitle,
                            averagedTokenValueTFIDF,
                            pipeline
                        )
                    })
                    // Title proximity is the primary ranking; vector distance breaks ties.
                    .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                    .ThenBy(similarResult => similarResult.Distance)
                    .Take(maximumNumberOfResultsToReturn)
                    .ToArray()
            });
        })
        .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    foreach (var postWithSimilarResults in postsWithSimilarResults)
    {
        Console.WriteLine();
        Console.WriteLine(postWithSimilarResults.Post.Title);
        foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
        {
            Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
        }
    }

    Console.WriteLine();
    Console.WriteLine("Done! Press [Enter] to terminate..");
    Console.ReadLine();
}
/// <summary>
/// Opens a uniquely-named temporary file (named after a fresh UID128, under TempPath)
/// that the OS deletes automatically when the stream is closed.
/// </summary>
public FileStream GetTempStream()
{
    var tempFilePath = Path.Combine(TempPath, UID128.New().ToString());
    return new FileStream(
        tempFilePath,
        FileMode.CreateNew,          // the UID-based name must not already exist
        FileAccess.ReadWrite,
        FileShare.None,
        30_000,                      // buffer size, matching the original setting
        FileOptions.DeleteOnClose);
}