Exemplo n.º 1
0
 /// <summary>
 /// Creates an entity type with every field supplied by the caller.
 /// </summary>
 /// <param name="type">Value stored in <c>Type</c>.</param>
 /// <param name="tag">Value stored in <c>Tag</c>.</param>
 /// <param name="metadata">Value stored in <c>Metadata</c>; stored as given, no copy is made.</param>
 /// <param name="targetUID">Value stored in <c>TargetUID</c>.</param>
 public EntityType(string type, EntityTag tag, Dictionary<string, string> metadata, UID128 targetUID)
 {
     this.Type = type;
     this.Tag = tag;
     this.Metadata = metadata;
     this.TargetUID = targetUID;
 }
Exemplo n.º 2
0
 /// <summary>
 /// Looks up the recorded status for a compilation.
 /// </summary>
 /// <param name="uid">Identifier of the compilation.</param>
 /// <returns>
 /// The status stored in <c>_perCompilationStatus</c> for <paramref name="uid"/>, or
 /// <c>CompilationStatus.Pending</c> when nothing has been recorded for it.
 /// </returns>
 public static CompilationStatus GetStatusOf(UID128 uid)
 {
     return _perCompilationStatus.TryGetValue(uid, out var recorded)
         ? recorded
         : CompilationStatus.Pending;
 }
Exemplo n.º 3
0
        /// <summary>
        /// Reports the current status of a compilation together with its log messages.
        /// </summary>
        /// <param name="compilationUID">Identifier of the compilation to report on.</param>
        /// <returns>A result carrying the status and the messages obtained from <c>Logging</c>.</returns>
        public async UnaryResult<CompilationResult> GetStatusAsync(UID128 compilationUID)
        {
            // Messages are materialised before the status is read; keep that order.
            var collectedMessages = Logging.GetMessagesOf(compilationUID).ToArray();
            var currentStatus     = Logging.GetStatusOf(compilationUID);

            return new CompilationResult
            {
                Status   = currentStatus,
                Messages = collectedMessages
            };
        }
Exemplo n.º 4
0
 /// <summary>
 /// Creates a document from fully materialised parts.
 /// </summary>
 /// <remarks>
 /// <paramref name="metadata"/> and <paramref name="tokenMetadata"/> are stored as <c>null</c> when they are
 /// <c>null</c> or contain no entries; every other argument is stored as given.
 /// </remarks>
 public Document(Language language, string value, List<List<TokenData>> tokensData, List<int[]> spanBounds, Dictionary<string, string> metadata, UID128 uid, List<string> labels, Dictionary<long, List<EntityType>> entityData, Dictionary<long, Dictionary<string, string>> tokenMetadata)
 {
     this.Language   = language;
     this.Value      = value;
     this.TokensData = tokensData;
     this.SpanBounds = spanBounds;

     // An empty dictionary is treated the same as no dictionary at all.
     this.Metadata = metadata?.Count > 0 ? metadata : null;

     this.UID        = uid;
     this.Labels     = labels;
     this.EntityData = entityData;

     // Same normalisation as for Metadata.
     this.TokenMetadata = tokenMetadata?.Count > 0 ? tokenMetadata : null;
 }
Exemplo n.º 5
0
        /// <summary>
        /// Lazily dequeues the log messages queued for a compilation.
        /// </summary>
        /// <param name="uid">Identifier of the compilation.</param>
        /// <returns>
        /// The messages removed from the queue registered for <paramref name="uid"/> in <c>_perCompilation</c>;
        /// an empty sequence when no queue is registered. Each yielded message has been dequeued.
        /// </returns>
        public static IEnumerable<LogMessage> GetMessagesOf(UID128 uid)
        {
            if (_perCompilation.TryGetValue(uid, out var pending))
            {
                while (pending.TryDequeue(out var next))
                {
                    yield return next;
                }
            }
        }
 /// <summary>
 /// Creates an immutable document; every argument is stored as given, without copying or normalisation.
 /// </summary>
 public ImmutableDocument(Language language, string value, TokenData[][] tokensData, long[] spanBounds, Dictionary<string, string> metadata, UID128 uID, string[] labels, Dictionary<long, EntityType[]> entityData, Dictionary<long, Dictionary<string, string>> tokenMetadata)
 {
     this.Language      = language;
     this.Value         = value;
     this.TokensData    = tokensData;
     this.SpanBounds    = spanBounds;
     this.Metadata      = metadata;
     this.UID           = uID;
     this.Labels        = labels;
     this.EntityData    = entityData;
     this.TokenMetadata = tokenMetadata;
 }
Exemplo n.º 7
0
 /// <summary>
 /// Asks the remote service for the status of a compilation.
 /// </summary>
 /// <param name="compilationUID">Identifier of the compilation to query.</param>
 /// <param name="cancellationToken">Token passed to the client call.</param>
 /// <returns>The result returned by the client's <c>GetStatusAsync</c>.</returns>
 public async Task<CompilationResult> GetStatusAsync(UID128 compilationUID, CancellationToken cancellationToken)
 {
     // The deadline is computed per call: now (UTC) plus the configured timeout.
     var deadline     = DateTime.UtcNow.Add(_timeout);
     var scopedClient = _client.WithDeadline(deadline).WithCancellationToken(cancellationToken);

     var result = await scopedClient.GetStatusAsync(compilationUID);
     return result;
 }
Exemplo n.º 8
0
        /// <summary>
        /// Console entry point: loads the blog posts, processes each one into a document, trains a FastText
        /// model on those documents, indexes the resulting document vectors in a small-world graph and prints,
        /// for every post, the most similar other posts with their distances.
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
            {
                // The document UID is derived from the post title so that vectors can be matched back to posts later
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                {
                    UID = post.Title.Hash128()
                };
                pipeline.ProcessSingle(document);
                return(Post: post, Document: document);
            })
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            // Training configuration for the model
            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            fastText.Train(
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
                );

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            var results = fastText
                          .GetDocumentVectors()
                          .Select(result =>
            {
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have a 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                    );
                return(UID: uid, result.Vector, postForResult.Post);
            })
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }
                );

            graph.AddItems(results);

            // For every post, use the "KNNSearch" method on the graph to find the three most similar posts
            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
            {
                // Request one result too many from the KNNSearch call because it's expected that the original
                // post will come back as the best match and we'll want to exclude that
                var similarResults = graph
                                     .KNNSearch(result, maximumNumberOfResultsToReturn + 1)
                                     .Where(similarResult => similarResult.Item.UID != result.UID)
                                     .Take(maximumNumberOfResultsToReturn); // Just in case the original post wasn't included

                return(new
                {
                    result.Post,
                    Similar = similarResults
                              .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance
                    })
                              .ToArray()
                });
            })
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
                                                       .ToArray();

            // Print each post title followed by its similar posts, closest (smallest distance) first
            foreach (var postWithSimilarResults in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(postWithSimilarResults.Post.Title);
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
Exemplo n.º 9
0
        /// <summary>
        /// Rebuilds a <see cref="Document"/> from its JSON object representation.
        /// </summary>
        /// <remarks>
        /// Reads the <c>Language</c>, <c>Value</c>, <c>UID</c>, <c>Metadata</c>, <c>Labels</c> and <c>TokensData</c>
        /// properties of <paramref name="jo"/>. <c>Metadata</c>, <c>Labels</c> and <c>TokensData</c> are optional;
        /// when <c>TokensData</c> is absent the document is returned without spans. A UID that cannot be parsed
        /// becomes <c>default(UID128)</c>. Spans that contain no tokens are skipped.
        /// </remarks>
        /// <param name="jo">JSON object to read the document from.</param>
        /// <returns>The populated document.</returns>
        public static Document FromJObject(JObject jo)
        {
            // Shared by every token that carries no entity types, to avoid one allocation per token.
            var emptyEntityTypes = new List<EntityType>();

            var doc = new Document();

            doc.Language = Languages.CodeToEnum((string)jo[nameof(Language)]);
            doc.Value    = (string)jo[nameof(Value)];
            doc.UID      = UID128.TryParse((string)(jo[nameof(UID)]), out var uid) ? uid : default(UID128);

            var docmtd = jo[nameof(Metadata)];

            if (!(docmtd is null) && docmtd.HasValues)
            {
                doc.Metadata = new Dictionary<string, string>();
                foreach (JProperty md in docmtd)
                {
                    doc.Metadata.Add(md.Name, (string)md.Value);
                }
            }

            if (jo.ContainsKey(nameof(Labels)))
            {
                doc.Labels = jo[nameof(Labels)].Select(jt => (string)jt).ToList();
            }

            var td = jo[nameof(TokensData)];

            // The indexer yields null for a missing property; treat that like the other optional
            // properties instead of failing with a NullReferenceException in the loop below.
            if (td is null)
            {
                return doc;
            }

            foreach (var sp in td)
            {
                // First pass: parse every token of the span, because the span bounds (first begin .. last end)
                // are only known once all of its tokens have been read.
                var tokens = new List<(int begin, int end, PartOfSpeech tag, int head, float frequency, List<EntityType> entityType, IDictionary<string, string> metadata, string replacement)>();

                foreach (var tk in sp)
                {
                    var ets         = tk[nameof(EntityType)];
                    var entityTypes = emptyEntityTypes;
                    if (!(ets is null) && ets.HasValues)
                    {
                        entityTypes = new List<EntityType>();
                        foreach (var et in ets)
                        {
                            Dictionary<string, string> entityMetadata = null;
                            var etmtd = et[nameof(Metadata)];
                            if (!(etmtd is null) && etmtd.HasValues)
                            {
                                entityMetadata = new Dictionary<string, string>();
                                foreach (JProperty md in etmtd)
                                {
                                    entityMetadata.Add(md.Name, (string)md.Value);
                                }
                            }

                            entityTypes.Add(new EntityType((string)(et[nameof(EntityType.Type)]),
                                                           (EntityTag)Enum.Parse(typeof(EntityTag), (string)(et[nameof(EntityType.Tag)])),
                                                           entityMetadata,
                                                           UID128.TryParse((string)(et[nameof(EntityType.TargetUID)]), out var uid2) ? uid2 : default(UID128)));
                        }
                    }

                    IDictionary<string, string> metadata = null;

                    var mtd = tk[nameof(Metadata)];
                    if (!(mtd is null) && mtd.HasValues)
                    {
                        metadata = new Dictionary<string, string>();
                        foreach (JProperty md in mtd)
                        {
                            metadata.Add(md.Name, (string)md.Value);
                        }
                    }

                    // Tag, Head and Frequency fall back to NONE, -1 and 0 when the property is missing.
                    tokens.Add((((int)(tk[nameof(TokenData.Bounds)][0])),
                                ((int)(tk[nameof(TokenData.Bounds)][1])),
                                (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), (string)(tk[nameof(TokenData.Tag)] ?? nameof(PartOfSpeech.NONE))),
                                ((int)(tk[nameof(TokenData.Head)] ?? "-1")),
                                (((float)(tk[nameof(TokenData.Frequency)] ?? 0f))),
                                entityTypes,
                                metadata,
                                (string)tk[nameof(TokenData.Replacement)]));
                }

                if (tokens.Count > 0)
                {
                    // Second pass: create the span and replay the parsed tokens into it.
                    var span = doc.AddSpan(tokens[0].begin, tokens[tokens.Count - 1].end);

                    foreach (var tk in tokens)
                    {
                        var token = span.AddToken(tk.begin, tk.end);
                        token.POS       = tk.tag;
                        token.Head      = tk.head;
                        token.Frequency = tk.frequency;

                        // The replacement was parsed above but previously never applied, so it was lost on load.
                        // NOTE(review): assumes the token exposes a settable Replacement, like POS/Head/Frequency - confirm.
                        if (tk.replacement is object)
                        {
                            token.Replacement = tk.replacement;
                        }

                        foreach (var et in tk.entityType)
                        {
                            token.AddEntityType(et);
                        }

                        if (tk.metadata is object)
                        {
                            foreach (var kv in tk.metadata)
                            {
                                token.Metadata.Add(kv.Key, kv.Value);
                            }
                        }
                    }
                }
            }

            return doc;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Registers an entry (one or more space-separated words) so that it maps to <paramref name="uid"/>.
        /// </summary>
        /// <remarks>
        /// Blank entries are ignored, as are entries that parse as an <c>int</c> when
        /// <c>Data.IgnoreOnlyNumeric</c> is set. A single word is stored directly under its hash; for several
        /// words the per-word hashes are recorded by position in <c>Data.MultiGramHashes</c> and the combined
        /// hash is stored in <c>Data.Hashes</c>. Words for which <c>IsLetter()</c> is false are also registered
        /// as tokenizer exceptions.
        /// </remarks>
        /// <param name="entry">Text of the entry.</param>
        /// <param name="uid">Identifier stored for the entry's hash.</param>
        public void AddEntry(string entry, UID128 uid)
        {
            ulong HashWord(string word)
            {
                return Data.IgnoreCase
                    ? Spotter.IgnoreCaseHash64(word.AsSpan())
                    : Spotter.Hash64(word.AsSpan());
            }

            void RegisterTokenizerExceptionIfNeeded(string word)
            {
                if (word.AsSpan().IsLetter())
                {
                    return;
                }

                //Null means don't replace by anything - keep token as is
                Data.TokenizerExceptions[word.CaseSensitiveHash32()] = new TokenizationException(null);
            }

            if (string.IsNullOrWhiteSpace(entry))
            {
                return;
            }

            //Ignore pure numerical entries
            if (Data.IgnoreOnlyNumeric && int.TryParse(entry, out _))
            {
                return;
            }

            var parts = entry.Trim().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            if (parts.Length == 1)
            {
                var onlyWord = parts[0];
                Data.Hashes[HashWord(onlyWord)] = uid;
                RegisterTokenizerExceptionIfNeeded(onlyWord);
                return;
            }

            ulong runningHash = 0;

            for (var position = 0; position < parts.Length; position++)
            {
                var current  = parts[position];
                var wordHash = HashWord(current);

                // The first word seeds the combined hash; every later word is folded into it.
                runningHash = position == 0 ? wordHash : Spotter.HashCombine64(runningHash, wordHash);

                // Make sure a hash set exists for this word position.
                if (Data.MultiGramHashes.Count < position + 1)
                {
                    Data.MultiGramHashes.Add(new HashSet<ulong>());
                }

                // Adding to a set is a no-op when the hash is already present.
                Data.MultiGramHashes[position].Add(wordHash);

                RegisterTokenizerExceptionIfNeeded(current);
            }

            Data.Hashes[runningHash] = uid;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Scans the tokens of a span for registered single-token and multi-token entries and tags the
        /// matching tokens with <c>CaptureTag</c> entity types.
        /// </summary>
        /// <param name="ispan">Span whose tokens are scanned.</param>
        /// <param name="stopOnFirstFound">
        /// When true, returns as soon as the first match is found, before any token is tagged.
        /// </param>
        /// <returns>True when at least one entry was matched; otherwise false.</returns>
        public bool RecognizeEntities(ISpan ispan, bool stopOnFirstFound = false)
        {
            var  tokens       = ispan.ToTokenSpan();
            int  N            = tokens.Length;
            bool hasMultiGram = Data.MultiGramHashes.Any();
            bool foundAny     = false;

            for (int i = 0; i < N; i++)
            {
                var tk = tokens[i];
                //if (tk.POS != PartOfSpeechEnum.NOUN && tk.POS != PartOfSpeechEnum.ADJ && tk.POS != PartOfSpeechEnum.PROPN) { continue; }

                var tokenHash = Data.IgnoreCase ? IgnoreCaseHash64(tk.ValueAsSpan) : Hash64(tk.ValueAsSpan);

                // Only try a multi-token match when this token is a known first word of some multi-word entry
                if (hasMultiGram && Data.MultiGramHashes[0].Contains(tokenHash))
                {
                    // Look ahead no further than the end of the span or the number of recorded word positions
                    int    window = Math.Min(N - i, Data.MultiGramHashes.Count);
                    ulong  hash   = tokenHash;
                    bool   someTokenHasReplacements = tk.Replacement is object;
                    int    i_final   = i;
                    UID128 uid_final = default;

                    for (int n = 1; n < window; n++)
                    {
                        var next = tokens[n + i];
                        someTokenHasReplacements |= (next.Replacement is object);

                        var nextHash = Data.IgnoreCase ? IgnoreCaseHash64(next.ValueAsSpan) : Hash64(next.ValueAsSpan);
                        if (Data.MultiGramHashes[n].Contains(nextHash))
                        {
                            //txt += " " + next.Value;
                            //var hashTxt = Hash64(txt);
                            hash = HashCombine64(hash, nextHash);
                            // Keep extending: the last (i.e. longest) combined hash found in Data.Hashes wins
                            if (Data.Hashes.TryGetValue(hash, out var uid_multi))
                            {
                                i_final   = i + n;
                                uid_final = uid_multi;
                            }
                        }
                        else
                        {
                            break;
                        }
                    }

                    // i_final only moves past i when a multi-token entry was matched
                    if (i_final > i)
                    {
                        foundAny = true;
                        if (stopOnFirstFound)
                        {
                            return(foundAny);
                        }                                          //Used for checking if the document contains any entity
                        tk.AddEntityType(new EntityType(CaptureTag, EntityTag.Begin, uid_final));
                        tokens[i_final].AddEntityType(new EntityType(CaptureTag, EntityTag.End, uid_final));

                        for (int m = i + 1; m < (i_final); m++)
                        {
                            tokens[m].AddEntityType(new EntityType(CaptureTag, EntityTag.Inside, uid_final));
                        }
                    }

                    // Continue the outer loop after the last token of the match (no change when nothing matched)
                    i = i_final;
                }

                // The single-token lookup still uses tk/tokenHash of the token the iteration started on,
                // even when a multi-token match was just tagged beginning at that token
                if (Data.Hashes.TryGetValue(tokenHash, out var uid))
                {
                    foundAny = true;
                    if (stopOnFirstFound)
                    {
                        return(foundAny);
                    }                                          //Used for checking if the document contains any entity
                    tk.AddEntityType(new EntityType(CaptureTag, EntityTag.Single, uid));
                }
            }
            return(foundAny);
        }
Exemplo n.º 12
0
 /// <summary>
 /// Logs the abort request and asks the compilation processor to abort the given compilation.
 /// </summary>
 /// <param name="compilationUID">Identifier of the compilation to abort.</param>
 /// <returns><c>Nil.Default</c>.</returns>
 public async UnaryResult<Nil> AbortAsync(UID128 compilationUID)
 {
     // The log entry is written before the abort is issued.
     Logger.ZLogInformation("==== ABORT {0}", compilationUID);
     CompilationProcessor.Abort(compilationUID);

     var nothing = Nil.Default;
     return nothing;
 }
Exemplo n.º 13
0
        /// <summary>
        /// Formats a log entry and either updates the per-compilation status (for "====" control messages)
        /// or queues the message for the compilation that is currently running.
        /// </summary>
        /// <remarks>
        /// Control messages are recognised by the "====" marker followed by a keyword: <c>RCV</c> and
        /// <c>BEGIN</c> carry a 22-character UID after the keyword; <c>SUCCESS</c>, <c>FAIL</c>, <c>ABORT</c>
        /// and <c>CANCELED</c> close the compilation tracked in <c>_currentUID</c>. Any message without the
        /// marker is queued for <c>_currentUID</c> when one is set, and dropped otherwise.
        /// </remarks>
        /// <param name="entry">Log entry to process.</param>
        private static void WriteEntry(IZLoggerEntry entry)
        {
            var boxedBuilder = (IBufferWriter<byte>) new Utf8ValueStringBuilder(false);

            try
            {
                entry.FormatUtf8(boxedBuilder, options, null);

                var message = boxedBuilder.ToString();

                // Markers and keywords are protocol text, so compare ordinally rather than by culture.
                var marker = message.IndexOf("====", System.StringComparison.Ordinal);

                if (marker >= 0)
                {
                    // Skip the marker and the separator after it; a message that ends right after the
                    // marker yields empty content instead of an out-of-range Substring call.
                    var contentStart = marker + 5;
                    var content      = contentStart <= message.Length ? message.Substring(contentStart) : string.Empty;

                    if (content.StartsWith("RCV", System.StringComparison.Ordinal))
                    {
                        var uid = UID128.Parse(content.Substring(4, 22));
                        _perCompilationStatus[uid] = CompilationStatus.Pending;
                    }
                    else if (content.StartsWith("BEGIN", System.StringComparison.Ordinal))
                    {
                        _currentUID = UID128.Parse(content.Substring(6, 22));
                        _perCompilation[_currentUID]       = new ConcurrentQueue<LogMessage>();
                        _perCompilationStatus[_currentUID] = CompilationStatus.OnGoing;
                    }
                    else if (content.StartsWith("SUCCESS", System.StringComparison.Ordinal))
                    {
                        _perCompilationStatus[_currentUID] = CompilationStatus.Success;
                        _currentUID = default;
                    }
                    else if (content.StartsWith("FAIL", System.StringComparison.Ordinal)
                             || content.StartsWith("ABORT", System.StringComparison.Ordinal)
                             || content.StartsWith("CANCELED", System.StringComparison.Ordinal))
                    {
                        // All three outcomes are recorded as a failure of the current compilation.
                        _perCompilationStatus[_currentUID] = CompilationStatus.Fail;
                        _currentUID = default;
                    }
                }
                else
                {
                    if (_currentUID.IsNotNull())
                    {
                        _perCompilation[_currentUID].Enqueue(new LogMessage()
                        {
                            Message   = message,
                            Timestamp = entry.LogInfo.Timestamp.ToUnixTimeMilliseconds(),
                            LogLevel  = entry.LogInfo.LogLevel
                        });
                    }
                }
            }
            finally
            {
                // The finally block was empty, so the builder was never disposed and its buffer was
                // never released. Unboxing copies the struct, but the copy shares the same buffer.
                ((Utf8ValueStringBuilder)boxedBuilder).Dispose();
            }
        }
Exemplo n.º 14
0
 /// <summary>
 /// Creates an entity type that points at a target UID and carries no metadata.
 /// </summary>
 /// <param name="type">Value stored in <c>Type</c>.</param>
 /// <param name="tag">Value stored in <c>Tag</c>.</param>
 /// <param name="targetUID">Value stored in <c>TargetUID</c>.</param>
 public EntityType(string type, EntityTag tag, UID128 targetUID)
 {
     this.Type      = type;
     this.Tag       = tag;
     this.Metadata  = null;
     this.TargetUID = targetUID;
 }
Exemplo n.º 15
0
 /// <summary>
 /// Creates an entity type with no metadata and a default (unset) target UID.
 /// </summary>
 /// <param name="type">Value stored in <c>Type</c>.</param>
 /// <param name="tag">Value stored in <c>Tag</c>.</param>
 public EntityType(string type, EntityTag tag)
 {
     this.Type      = type;
     this.Tag       = tag;
     this.Metadata  = null;
     this.TargetUID = default(UID128);
 }
Exemplo n.º 16
0
 /// <summary>
 /// Asks the remote service to abort a compilation.
 /// </summary>
 /// <param name="compilationUID">Identifier of the compilation to abort.</param>
 /// <param name="cancellationToken">Token passed to the client call.</param>
 public async Task AbortAsync(UID128 compilationUID, CancellationToken cancellationToken)
 {
     // The deadline is computed per call: now (UTC) plus the configured timeout.
     var deadline     = DateTime.UtcNow.Add(_timeout);
     var scopedClient = _client.WithDeadline(deadline).WithCancellationToken(cancellationToken);

     await scopedClient.AbortAsync(compilationUID);
 }
Exemplo n.º 17
0
        /// <summary>
        /// Console entry point: loads the blog posts, processes each one into a document, trains a FastText
        /// model and a TF-IDF model on those documents, indexes the document vectors in a small-world graph
        /// and prints, for every post, the related posts chosen by title TF-IDF proximity and vector distance.
        /// </summary>
        private static async Task Main()
        {
            Console.WriteLine("Reading posts from GitHub repo..");
            var posts = await GetBlogPosts();

            Console.WriteLine("Parsing documents..");
            Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));
            var language           = Language.English;
            var pipeline           = Pipeline.For(language);
            var postsWithDocuments = posts
                                     .Select(post =>
            {
                // The document UID is derived from the post title so that vectors can be matched back to posts later
                var document = new Document(NormaliseSomeCommonTerms(post.PlainTextContent), language)
                {
                    UID = post.Title.Hash128()
                };
                pipeline.ProcessSingle(document);
                return(Post: post, Document: document);
            })
                                     .ToArray(); // Call ToArray to force evaluation of the document processing now

            Console.WriteLine("Training FastText model..");
            var fastText = new FastText(language, version: 0, tag: "");

            // Training configuration for the model
            fastText.Data.Type                  = FastText.ModelType.PVDM;
            fastText.Data.Loss                  = FastText.LossType.NegativeSampling;
            fastText.Data.IgnoreCase            = true;
            fastText.Data.Epoch                 = 50;
            fastText.Data.Dimensions            = 512;
            fastText.Data.MinimumCount          = 1;
            fastText.Data.ContextWindow         = 10;
            fastText.Data.NegativeSamplingCount = 20;
            fastText.Train(
                postsWithDocuments.Select(postsWithDocument => postsWithDocument.Document),
                trainingStatus: update => Console.WriteLine($" Progress: {update.Progress}, Epoch: {update.Epoch}")
                );

            Console.WriteLine("Training TF-IDF model..");
            var tfidf = new TFIDF(pipeline.Language, version: 0, tag: "");
            await tfidf.Train(postsWithDocuments.Select(postWithDocument => postWithDocument.Document));

            Console.WriteLine("Getting average TF-IDF weights per word..");
            // Collect, per token value (case-insensitive), every Frequency seen after TF-IDF processing
            var tokenValueTFIDF = new Dictionary <string, List <float> >(StringComparer.OrdinalIgnoreCase);

            foreach (var doc in postsWithDocuments.Select(postWithDocument => postWithDocument.Document))
            {
                tfidf.Process(doc);
                foreach (var sentence in doc)
                {
                    foreach (var token in sentence)
                    {
                        if (!tokenValueTFIDF.TryGetValue(token.Value, out var freqs))
                        {
                            freqs = new();
                            tokenValueTFIDF.Add(token.Value, freqs);
                        }
                        freqs.Add(token.Frequency);
                    }
                }
            }
            // Reduce each list of collected values to its average
            var averagedTokenValueTFIDF = tokenValueTFIDF.ToDictionary(
                entry => entry.Key,
                entry => entry.Value.Average(), StringComparer.OrdinalIgnoreCase
                );

            Console.WriteLine("Building recommendations..");

            // Combine the blog post data with the FastText-generated vectors
            var results = fastText
                          .GetDocumentVectors()
                          .Select(result =>
            {
                // Each document vector instance will include a "token" string that may be mapped back to the
                // UID of the document for each blog post. If there were a large number of posts to deal with
                // then a dictionary to match UIDs to blog posts would be sensible for performance but I only
                // have a 100+ and so a LINQ "First" scan over the list will suffice.
                var uid           = UID128.Parse(result.Token);
                var postForResult = postsWithDocuments.First(
                    postWithDocument => postWithDocument.Document.UID == uid
                    );
                return(UID: uid, result.Vector, postForResult.Post);
            })
                          .ToArray(); // ToArray since we enumerate multiple times below

            // Construct a graph to search over, as described at
            // https://github.com/curiosity-ai/hnsw-sharp#how-to-build-a-graph
            var graph = new SmallWorld <(UID128 UID, float[] Vector, BlogPost Post), float>(
                distance: (to, from) => CosineDistance.NonOptimized(from.Vector, to.Vector),
                DefaultRandomGenerator.Instance,
                new() { M = 15, LevelLambda = 1 / Math.Log(15) }
                );

            graph.AddItems(results);

            const int maximumNumberOfResultsToReturn = 3;
            var       postsWithSimilarResults        = results
                                                       .Select(result =>
            {
                // Request that the KNNSearch operate over all documents because we can't take the top {n}
                // until we've combined the ordering with the title TFIDF proximity values
                var similarResults = graph
                                     .KNNSearch(result, postsWithDocuments.Length)
                                     .Where(similarResult => similarResult.Item.UID != result.UID);

                // Distinct token values (case-insensitive) of this post's normalised title
                var tokenValuesInTitle =
                    GetAllTokensForText(NormaliseSomeCommonTerms(result.Post.Title), pipeline)
                    .Select(token => token.Value)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

                return(new
                {
                    result.Post,
                    Similar = similarResults
                              .Select(similarResult => new
                    {
                        similarResult.Item.Post,
                        similarResult.Distance,
                        ProximityByTitleTFIDF = GetProximityByTitleTFIDF(
                            NormaliseSomeCommonTerms(similarResult.Item.Post.Title),
                            tokenValuesInTitle,
                            averagedTokenValueTFIDF,
                            pipeline
                            )
                    })
                              // Rank by title proximity first (highest wins), then by vector distance (lowest wins)
                              .OrderByDescending(similarResult => similarResult.ProximityByTitleTFIDF)
                              .ThenBy(similarResult => similarResult.Distance)
                              .Take(maximumNumberOfResultsToReturn)
                              .ToArray()
                });
            })
                                                       .OrderBy(result => result.Post.Title, StringComparer.OrdinalIgnoreCase)
                                                       .ToArray();

            // Print each post title followed by its selected similar posts; note that the selected posts are
            // re-ordered by distance alone for display
            foreach (var postWithSimilarResults in postsWithSimilarResults)
            {
                Console.WriteLine();
                Console.WriteLine(postWithSimilarResults.Post.Title);
                foreach (var similarResult in postWithSimilarResults.Similar.OrderBy(other => other.Distance))
                {
                    Console.WriteLine($"{similarResult.ProximityByTitleTFIDF:0.000} {similarResult.Distance:0.000} {similarResult.Post.Title}");
                }
            }

            Console.WriteLine();
            Console.WriteLine("Done! Press [Enter] to terminate..");
            Console.ReadLine();
        }
Exemplo n.º 18
0
 /// <summary>
 /// Opens a new, exclusively held read/write file under <c>TempPath</c>, named after a freshly generated UID.
 /// </summary>
 /// <returns>
 /// A stream created with <c>FileMode.CreateNew</c> and <c>FileOptions.DeleteOnClose</c>; the caller owns it
 /// and is responsible for disposing it.
 /// </returns>
 public FileStream GetTempStream()
 {
     const int bufferSize = 30_000;

     var directory = TempPath;
     var fileName  = UID128.New().ToString();
     var fullPath  = Path.Combine(directory, fileName);

     return new FileStream(fullPath, FileMode.CreateNew, FileAccess.ReadWrite, FileShare.None, bufferSize, FileOptions.DeleteOnClose);
 }