public void Test_Correct_TokenType()
{
    string input = "everything (over 1.11 + 1)&& ;<";
    var tokens = TextTokenizer.TextTokenize(input);

    Assert.Equal(TextTokenType.Word, tokens[0].Type);
    Assert.Equal(TextTokenType.ParentheseStart, tokens[1].Type);
    Assert.Equal(TextTokenType.Word, tokens[2].Type);
    Assert.Equal(TextTokenType.Number, tokens[3].Type);
    Assert.Equal(TextTokenType.Operator, tokens[4].Type);
    Assert.Equal(TextTokenType.Number, tokens[5].Type);
    Assert.Equal(TextTokenType.ParentheseStop, tokens[6].Type);
    Assert.Equal(TextTokenType.Operator, tokens[7].Type);
    Assert.Equal(TextTokenType.EndOfSentence, tokens[8].Type);
    Assert.Equal(TextTokenType.Operator, tokens[9].Type);
}
private List<Document> preprocessDataset(string directoryUrl)
{
    List<Document> dataset = new List<Document>();
    string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(Directory.GetCurrentDirectory()));

    foreach (string file in Directory.EnumerateFiles(Path.Combine(baseDirPath, "dataset"), "*.json"))
    {
        string json = File.ReadAllText(file);
        List<DocItem> docItems = JsonConvert.DeserializeObject<List<DocItem>>(json);

        foreach (var item in docItems)
        {
            // Skip documents that carry no topic labels.
            if (item.topics == null || item.topics.Length == 0)
            {
                continue;
            }

            // For each doc, tokenize its title and body and convert it into a Document object.
            Document document = TextTokenizer.tokenize(item.title + " " + item.body);
            document.categories = item.topics.ToList();
            dataset.Add(document);
        }
    }

    return dataset;
}
public void Test_Correct_TokenContent()
{
    string input = "everything is over 1.11 + 1 && 1 < 1";
    var tokens = TextTokenizer.TextTokenize(input);

    Assert.Equal("everything", new String(tokens[0].RawData.ToArray()));
    Assert.Equal("is", new String(tokens[1].RawData.ToArray()));
    Assert.Equal("over", new String(tokens[2].RawData.ToArray()));
    Assert.Equal("1.11", new String(tokens[3].RawData.ToArray()));
    Assert.Equal("+", new String(tokens[4].RawData.ToArray()));
    Assert.Equal("1", new String(tokens[5].RawData.ToArray()));
    Assert.Equal("&&", new String(tokens[6].RawData.ToArray()));
    Assert.Equal("1", new String(tokens[7].RawData.ToArray()));
    Assert.Equal("<", new String(tokens[8].RawData.ToArray()));
    Assert.Equal("1", new String(tokens[9].RawData.ToArray()));
}
public async Task<IEnumerable<string>> Audios([FromBody] TextBody textBody)
{
    var tokenizer = new TextTokenizer();
    var tokens = tokenizer.GetWords(textBody.Input);
    var audios = new List<string>();

    foreach (var token in tokens)
    {
        var composition = new Composition
        {
            Return = new ContentSegment
            {
                Url = $"https://forvo.com/word/{token.Value.RemoveAccents()}/#ru",
                Select = "span.play"
            }
        };

        var elements = await composition.Return.DocumentElement();
        foreach (var element in elements)
        {
            // The audio URL is Base64-encoded in the fifth argument of the onclick handler.
            var onclick = element.GetAttribute("onclick");
            var onclickParts = onclick.Split(',');
            if (onclickParts.Length >= 5)
            {
                audios.Add(Encoding.UTF8.GetString(Convert.FromBase64String(onclickParts[4].Trim('\'', '"'))));
            }

            if (audios.Count >= 5)
            {
                break;
            }
        }
    }

    return audios;
}
/// <summary>
/// Preprocesses the original dataset and converts it to a list of Documents.
/// </summary>
/// <param name="trainingDataset">Map from category name to its training examples.</param>
/// <returns>The tokenized dataset.</returns>
private IList<Document> preprocessDataset(IDictionary<string, string[]> trainingDataset)
{
    IList<Document> dataset = new List<Document>();

    // Loop through all the categories and their training examples.
    foreach (KeyValuePair<string, string[]> entry in trainingDataset)
    {
        string category = entry.Key;
        string[] examples = entry.Value;

        foreach (string example in examples)
        {
            // For each example in the category, tokenize its text and convert it into a Document object.
            Document doc = TextTokenizer.tokenize(example);
            doc.category = category;
            dataset.Add(doc);
        }
    }

    return dataset;
}
public List<string> predict(string text, int topKCategories = 3)
{
    if (knowledgeBase == null)
    {
        throw new ArgumentException("Knowledge Bases missing: Make sure you train first a classifier before you use it.");
    }

    // Tokenize the text and create a new document.
    Document doc = TextTokenizer.tokenize(text);

    Dictionary<string, double> predictionScores = new Dictionary<string, double>();
    foreach (var categoryCounts in knowledgeBase.logPriors)
    {
        // Initialize the score with the category's log prior.
        double logprob = categoryCounts.Value;

        // For each feature of the document...
        foreach (var tokenCount in doc.tokens)
        {
            if (!knowledgeBase.logConditionalProbability.ContainsKey(tokenCount.Key))
            {
                continue; // If the feature does not exist, just skip it.
            }

            double occurrences = tokenCount.Value; // Its occurrences in the text.
            if (knowledgeBase.logConditionalProbability[tokenCount.Key].ContainsKey(categoryCounts.Key))
            {
                // Weight the log-likelihood by the feature's occurrences in the text.
                logprob += occurrences * knowledgeBase.logConditionalProbability[tokenCount.Key][categoryCounts.Key];
            }
        }

        predictionScores.Add(categoryCounts.Key, logprob);
    }

    // Sort the categories by descending score and keep those with positive odds.
    var list = predictionScores.ToList();
    list.Sort((pair1, pair2) => pair2.Value.CompareTo(pair1.Value));

    List<string> result = new List<string>();
    foreach (var l in list)
    {
        if (l.Value > 0.0)
        {
            result.Add(l.Key);
        }
    }

    // Return at most the top K categories.
    return result.Count >= topKCategories ? result.GetRange(0, topKCategories) : result;
}
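// Illustrative usage sketch of the top-K predictor above (not from the original
// source): `NaiveBayesClassifier` and `train` are assumed names for the enclosing
// class and its training method, whose existence the exception message implies.
var classifier = new NaiveBayesClassifier();
classifier.train(new Dictionary<string, string[]>
{
    ["sports"] = new[] { "the team won the cup", "the match ended in a draw" },
    ["tech"] = new[] { "tokenizers split text into words", "parsers build an AST" }
});
List<string> topCategories = classifier.predict("the team won the match", topKCategories: 2);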
public void TestSplitToWords()
{
    TextTokenizer tokenizer = new TextTokenizer();

    // Russian sample: punctuation and numbers are dropped, words are kept.
    string text1 = "Я, ты, он, ,она - оно! Они: их 123, им?";
    string[] expected1 = new string[] { "Я", "ты", "он", "она", "оно", "Они", "их", "им" };

    string[] result1 = tokenizer.SplitToWords(text1);
    CollectionAssert.AreEqual(expected1, result1);
}
public static QueryExpression Parse(string text)
{
    using (var sr = new StringReader(text))
    {
        var tokenizer = new TextTokenizer(TextTokenizer.QUERY_MODE);
        tokenizer.Run(sr);

        var parser = new Parser();
        return parser.Run(tokenizer.Texts);
    }
}
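// Illustrative call site for Parse (not from the original source); the query
// string format shown here is an assumption.
QueryExpression expression = Parse("is:open label:bug \"text tokenizer\"");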
private static void AddTermsFromPlainText(ISet<string> target, string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    var tokens = TextTokenizer.Tokenize(text);
    target.UnionWith(tokens);
}
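// Illustrative only (not from the original source): accumulate the terms of
// several plain-text fields into one case-insensitive set. `title` and `body`
// are hypothetical inputs.
var terms = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
AddTermsFromPlainText(terms, title);
AddTermsFromPlainText(terms, body);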
public void Order_Of_Adding_And_Multiplying_Is_Correct2()
{
    string input = "1+2*3;";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    // Multiplication binds tighter than addition, so the root node is the Add.
    Assert.Equal(ExpressionType.BinaryOperation, AST.Type);
    Assert.Equal(BinaryExpressionType.Add, ((BinaryExpressionNode)AST).BinaryType);
}
/// <summary>
/// Predicts the category of a text by using an already trained classifier
/// and returns its category.
/// </summary>
/// <param name="text">The text to classify.</param>
/// <returns>The predicted category.</returns>
/// <exception cref="ArgumentException">Thrown when the classifier has not been trained yet.</exception>
public virtual string predict(string text)
{
    if (knowledgeBase == null)
    {
        throw new System.ArgumentException("Knowledge Bases missing: Make sure you train first a classifier before you use it.");
    }

    // Tokenize the text and create a new document.
    Document doc = TextTokenizer.tokenize(text);

    string maxScoreCategory = null;
    double? maxScore = double.NegativeInfinity;

    foreach (KeyValuePair<string, double> entry1 in knowledgeBase.logPriors)
    {
        string category = entry1.Key;
        double? logprob = entry1.Value; // Initialize the score with the category's log prior.

        // For each feature of the document...
        foreach (KeyValuePair<string, int> entry2 in doc.tokens)
        {
            string feature = entry2.Key;
            if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
            {
                continue; // If the feature does not exist in the knowledge base, skip it.
            }

            int occurrences = entry2.Value; // Its occurrences in the text.
            logprob += occurrences * knowledgeBase.logLikelihoods[feature][category]; // Weight the log-likelihood by the occurrences.
        }

        if (logprob > maxScore)
        {
            maxScore = logprob;
            maxScoreCategory = category;
        }
    }

    return maxScoreCategory; // Return the category with the highest score.
}
private static void Apply(IssueFilter result, BoundTextQuery expression)
{
    var terms = TextTokenizer.Tokenize(expression.Text);
    foreach (var term in terms)
    {
        if (expression.IsNegated)
        {
            result.ExcludedTerms.Add(term);
        }
        else
        {
            result.IncludedTerms.Add(term);
        }
    }
}
public void One_Plus_One_Is_Two()
{
    string input = "1 + 1";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();
    ExecutionEngine executionEngine = new ExecutionEngine(context);
    executionEngine.ExecuteExpression(AST);

    Assert.Equal(2.0, (double)AST.EvaluatedValue.Value);
}
public void Basic_Boolean_Logic_Got_Correct_AST()
{
    string input = "1 + 3 > 1 && 1 + 1 > 0";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    // The root is the &&; both of its operands are greater-than comparisons.
    var BinaryAst = (BinaryExpressionNode)AST;
    var left = (BinaryExpressionNode)BinaryAst.Left;
    var right = (BinaryExpressionNode)BinaryAst.Right;

    Assert.Equal(BinaryExpressionType.GreaterThan, left.BinaryType);
    Assert.Equal(BinaryExpressionType.GreaterThan, right.BinaryType);
}
public override string Summarize(string text, string lang = "en")
{
    var sentences = SplitTextOnSentences(text);
    int summarySize = 3;
    if (sentences.Count <= summarySize)
    {
        return text;
    }

    _stopWordFilter = _nlpServiceProvider.GetStopWordFilter(lang);
    var stemmer = _nlpServiceProvider.GetStemmer(lang);

    // Tokenize and stem every sentence.
    var tokenizedSentences = new List<IList<string>>();
    foreach (var sentence in sentences)
    {
        var tokenizer = new TextTokenizer(sentence, filterMapper: new TextFilterMapper
        {
            Map = (t) => stemmer.Stem(t)
        });
        tokenizedSentences.Add(tokenizer.ToList());
    }

    // Rank sentences with PageRank over the sentence-similarity graph.
    var matrix = BuildSimilarityMatrix(tokenizedSentences);
    var graph = BuildDirectedGraph(matrix);
    var result = new PageRank()
        .Rank(graph)
        .OrderBy(kv => kv.Value); // Lower value, better result.

    var summary = "";
    var topSentences = result.Take(summarySize).OrderBy(kv => kv.Key); // Restore the sentences' order in the text.
    foreach (var topSent in topSentences)
    {
        summary += sentences[topSent.Key] + ". ";
    }

    return summary;
}
public CrawledIndexCompletionProvider(CrawledIndex index)
{
    _orgs = new SortedSet<string>(
        index.Repos.Select(r => r.Org),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _repos = new SortedSet<string>(
        index.Repos.SelectMany(r => new[] { r.Name, r.FullName }),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _users = new SortedSet<string>(
        index.Repos.SelectMany(r => r.Issues.Values)
                   .SelectMany(i => new[] { i.CreatedBy }.Concat(i.Assignees)),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _labels = new SortedSet<string>(
        index.Repos.SelectMany(r => r.Labels)
                   .Select(l => l.Name),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _milestones = new SortedSet<string>(
        index.Repos.SelectMany(r => r.Milestones)
                   .Select(m => m.Title),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _areaPaths = new SortedSet<string>(
        index.Repos.SelectMany(r => r.Labels)
                   .SelectMany(l => TextTokenizer.GetAreaPaths(l.Name)),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();

    _areaNodes = new SortedSet<string>(
        index.Repos.SelectMany(r => r.Labels)
                   .SelectMany(l => TextTokenizer.GetAreaPaths(l.Name, segmentsOnly: true)),
        StringComparer.OrdinalIgnoreCase
    ).ToArray();
}
public async Task<Translation> Translation([FromBody] TextBody textBody)
{
    var translation = new Translation();
    using (var service = new TranslateService(new BaseClientService.Initializer
    {
        ApiKey = "AIzaSyBiPTuvLggID2YrmBshuHBZhij6HeFOxko",
        ApplicationName = "Project Name"
    }))
    {
        var input = textBody.Input;

        // An "en:" prefix marks English input; translate it to Russian first.
        if (input.StartsWith("en:"))
        {
            var russianResponse = await service.Translations.List(new string[] { input.Replace("en:", "") }, "ru").ExecuteAsync();
            input = russianResponse.Translations[0].TranslatedText;
        }

        var tokenizer = new TextTokenizer();
        var blocks = tokenizer.GetBlocks(input);
        var blockResponse = await service.Translations.List(blocks.Select(t => t.OriginalText).ToArray(), "en").ExecuteAsync();

        for (int i = 0; i < blockResponse.Translations.Count; i++)
        {
            blocks[i].TranslatedText = blockResponse.Translations[i].TranslatedText;

            // Translate each word of the block individually as well.
            var words = tokenizer.GetWords(blocks[i].OriginalText);
            var wordResponse = await service.Translations.List(words.Select(t => t.Value).ToArray(), "en").ExecuteAsync();
            for (int j = 0; j < wordResponse.Translations.Count; j++)
            {
                words[j].Translation = wordResponse.Translations[j].TranslatedText;
                blocks[i].Words.Add(words[j]);
            }

            translation.Blocks.Add(blocks[i]);
        }

        return translation;
    }
}
public void TestCase()
{
    var text = "This is {test} \"te\\\"xt\".";
    var tokenizer = new TextTokenizer(text,
        WordRule.Default,
        WhitespaceRule.Default,
        StringRule.Default,
        new SingleCharRule('{'),
        new SingleCharRule('}'),
        new SingleCharRule('.'));

    var tokens = tokenizer.ReadToEnd();

    Assert.AreEqual(WordRule.Default, tokens[0].Rule);                 // "This"
    Assert.AreEqual(WhitespaceRule.Default, tokens[1].Rule);           // " "
    Assert.AreEqual(typeof(WordRule), tokens[2].Rule.GetType());       // "is"
    Assert.AreEqual(" ", tokens[3].Text);
    Assert.AreEqual(typeof(SingleCharRule), tokens[4].Rule.GetType()); // "{"
    Assert.AreEqual("test", tokens[5].Text);
    Assert.AreEqual("}", tokens[6].Text);
    Assert.AreEqual(typeof(WhitespaceRule), tokens[7].Rule.GetType());
    Assert.AreEqual(StringRule.Default, tokens[8].Rule);
    Assert.AreEqual("te\"xt", tokens[8].Text); // The escaped quote is unescaped.
}
public void Variables_Can_Be_Found_By_Name()
{
    string input = "1+Apple";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    var BinaryAst = (BinaryExpressionNode)AST;
    var left = (ConstantExpression)BinaryAst.Left;
    var right = (VariableExpression)BinaryAst.Right;

    Assert.Equal(BinaryExpressionType.Add, BinaryAst.BinaryType);
    Assert.Equal(1.0, (double)left.Value.Value);
    Assert.Equal("Apple", right.VariableValue.VariableName);
}
public void One_Plus_Two_Is_Greater_Than_Two()
{
    string input = "1 + 2 > 2";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();
    ExecutionEngine executionEngine = new ExecutionEngine(context);
    executionEngine.ExecuteExpression(AST);

    Assert.Equal(ValueType.Logical, AST.EvaluatedValue.Type);
    Assert.True((bool)AST.EvaluatedValue.Value);
}
public void Add_Operator_Has_Higher_Precedence_Than_And()
{
    string input = "1 && 1 + 1";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    // `+` binds tighter than `&&`, so the root is the And node and the
    // addition becomes its right operand.
    var BinaryAst = (BinaryExpressionNode)AST;
    var left = (ConstantExpression)BinaryAst.Left;
    var right = (BinaryExpressionNode)BinaryAst.Right;

    Assert.Equal(BinaryExpressionType.And, BinaryAst.BinaryType);
    Assert.Equal(1.0, (double)left.Value.Value);
    Assert.Equal(BinaryExpressionType.Add, right.BinaryType);
}
public void Variable_Can_Be_Found_From_Context()
{
    string input = "Word";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    // Variable lookup is case-insensitive: "Word" resolves the value stored as "word".
    global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();
    context.AddVariableValue("word", 2.313);

    ExecutionEngine executionEngine = new ExecutionEngine(context);
    executionEngine.ExecuteExpression(AST);

    Assert.Equal(ValueType.Number, AST.EvaluatedValue.Type);
    Assert.Equal(2.313, (double)AST.EvaluatedValue.Value);
}
public void Parentheses_Affect_The_Order_Of_AST()
{
    string input = "1*(2+2.2)+3;";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    // Drill down to the parenthesized (2 + 2.2) sub-expression.
    var BinaryAst = (BinaryExpressionNode)AST;
    var left = (BinaryExpressionNode)BinaryAst.Left;
    var leftRight = (ExpressionNode)left.Right;
    var targetExpression = (BinaryExpressionNode)leftRight.Expression;

    Assert.Equal(BinaryExpressionType.Add, targetExpression.BinaryType);
    Assert.Equal(2.0, (double)targetExpression.Left.Value.Value);
    Assert.Equal(2.2, (double)targetExpression.Right.Value.Value);
}
public void Ternary_Expression_Parsed_Correctly()
{
    string input = " 1 == 2 ? 2 : 3";
    var TextTokens = TextTokenizer.TextTokenize(input);

    Parser parsers = new Parser(TextTokens);
    var AST = parsers.ParseExpression();

    var ternary = (TernaryExpressionNode)AST;
    var first = (BinaryExpressionNode)ternary.First;
    var second = (ConstantExpression)ternary.Second;
    var third = (ConstantExpression)ternary.Third;

    Assert.Equal(BinaryExpressionType.Equal, first.BinaryType);
    Assert.NotNull(second);
    Assert.NotNull(third);
}
bool ReadNode(HtmlElement parentElement, out HtmlNode node, string endElementLocalName = null)
{
    if (MoveNext())
    {
        switch (Current.Type)
        {
            case HtmlTagType.Define:
                node = new HtmlDocumentType(Current.Attributes, parentElement);
                return true;

            case HtmlTagType.Single:
                node = new HtmlElement(Current.Name, Current.Attributes, parentElement);
                return true;

            case HtmlTagType.Start:
            {
                HtmlElement element = new HtmlElement(Current.Name, Current.Attributes, parentElement);
                node = element;

                // Content tags are read as raw text up to their matching end tag.
                if (HtmlAPI.IsContentTag(element.LocalName))
                {
                    TextTokenizer.SkipIgnoreCharacters();
                    string text = TextTokenizer.ReadTo($"</{element.LocalName}>").TrimEnd();
                    if (text.Length > 0)
                    {
                        node.ChildNodes.Add(new HtmlText(text, element));
                    }
                    return true;
                }

                // Recursively read child nodes until the matching end tag, linking siblings.
                Tags.Add(element.LocalName);
                HtmlNode lastNode = null;
                while (ReadNode(element, out HtmlNode childNode, element.LocalName))
                {
                    if (childNode is null)
                    {
                        continue;
                    }
                    if (lastNode != null)
                    {
                        lastNode.NextSibling = childNode;
                        childNode.PreviousSibling = lastNode;
                    }
                    node.ChildNodes.Add(childNode);
                    lastNode = childNode;
                }
                return true;
            }

            case HtmlTagType.End:
                if (endElementLocalName != null && !HtmlAPI.Equals(Current.Name, endElementLocalName))
                {
                    // Mismatched end tag: push it back if it closes an outer element,
                    // otherwise treat the current element as closed.
                    if (Tags.Contains(Current.Name))
                    {
                        Buffer.Push(Current);
                    }
                    else
                    {
                        node = null;
                        return true;
                    }
                }
                if (Tags.Count > 0)
                {
                    Tags.RemoveAt(Tags.Count - 1);
                }
                break;

            case HtmlTagType.Text:
                node = new HtmlText(HtmlAPI.Unescape(Current.Content), parentElement);
                return true;

            case HtmlTagType.Comment:
                node = new HtmlComment(Current.Content, parentElement);
                return true;

            default:
                throw new NotImplementedException(Current.Type.ToString());
        }
    }

    node = null;
    return false;
}
public void TokenizerShouldConstruct()
{
    ITextTokenizer textTokenizer = new TextTokenizer(new TextHelper());
}
private static IEnumerable<string> GetFileTokens(string fileName, bool ignoreCase)
{
    string text = null;
    try
    {
        text = File.ReadAllText(fileName);
    }
    catch (IOException ex)
    {
        Debug.WriteLine(ex.Message + ex.StackTrace);
    }

    if (text != null)
    {
        var tokenizer = new TextTokenizer(text);
        var tokens = tokenizer.Tokenize();
        var filteredWords = tokens.Select(token => ignoreCase ? token.ToLower() : token);
        foreach (var filteredWord in filteredWords)
        {
            yield return filteredWord;
        }
    }
}
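// Illustrative only (not from the original source): count case-insensitive
// word frequencies over a file's tokens. The file path is hypothetical.
var frequencies = new Dictionary<string, int>();
foreach (var token in GetFileTokens(@"docs\sample.txt", ignoreCase: true))
{
    frequencies.TryGetValue(token, out var count);
    frequencies[token] = count + 1;
}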
public object ParseValue(TextTokenizer tokenizer, ITokenRule endToken)
{
    var required = endToken == null
        ? new[] { word, text, startObject, startArray }
        : new[] { word, text, startObject, startArray, endToken };

    var token = tokenizer.NextToken(required: required);

    if (token.Rule == word)
    {
        // Bare words are either booleans or numbers.
        if (bool.TryParse(token.Text, out var boolValue))
        {
            return boolValue;
        }
        if (double.TryParse(token.Text, out var numberValue))
        {
            return numberValue;
        }
        throw new NotSupportedException("not supported token");
    }
    else if (token.Rule == text)
    {
        return token.Text;
    }
    else if (token.Rule == startArray)
    {
        // Read comma-separated values until the closing bracket.
        var list = new List<object>();
        object val;
        while ((val = ParseValue(tokenizer, endArray)) != null)
        {
            list.Add(val);
            if (tokenizer.NextToken(required: new[] { endArray, comma }).Rule == endArray)
            {
                break;
            }
        }
        return list.ToArray();
    }
    else if (token.Rule == startObject)
    {
        // Read "name: value" pairs until the closing brace.
        var obj = new Dictionary<string, object>();
        do
        {
            var prop = tokenizer.NextToken(new[] { word, endObject });
            if (prop.Rule == endObject)
            {
                break;
            }
            tokenizer.NextToken(new[] { colon });
            obj.Add(prop.Text, ParseValue(tokenizer, null));
            token = tokenizer.NextToken(new[] { comma, endObject });
        } while (token.Rule != endObject);
        return obj;
    }

    return null;
}
public async Task<ImageAnalysisResult> AnalyzeImageAsync(string url)
{
    var analysisResult = new ImageAnalysisResult();

    try
    {
        // Using the Microsoft-provided VisionClientLibrary seems not to work in .NET Core as-is; a fix is required for ExpandoObject.
        // See: https://github.com/Microsoft/Cognitive-Vision-DotNetCore/pull/1/commits/9c4647edb400aecd4def330537d5bcd74f126111
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): initializing VisionAPI client");
        var visionApiClient = new VisionServiceClient(m_VisionAPISubscriptionKey, "https://westeurope.api.cognitive.microsoft.com/vision/v1.0");

        var visualFeatures = new List<VisualFeature>
        {
            VisualFeature.Adult, VisualFeature.Categories, VisualFeature.Color,
            VisualFeature.Description, VisualFeature.Faces, VisualFeature.ImageType /*, VisualFeature.Tags */
        };
        var details = new List<string> { "Celebrities", "Landmarks" };

        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): started image analysis");
        var visionApiResult = await visionApiClient.AnalyzeImageAsync(url, visualFeatures, details).ConfigureAwait(false);

        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): executing OCR");
        var ocrResult = await visionApiClient.RecognizeTextAsync(url).ConfigureAwait(false);

        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): performing tag identification");
        var tagsResult = await visionApiClient.GetTagsAsync(url).ConfigureAwait(false);

        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): analysis completed");

        // Map the VisionAPI client entities to domain entities.
        analysisResult.AdultContent = new ImageAnalysisAdultContentResult
        {
            AdultScore = visionApiResult.Adult.AdultScore,
            IsAdultContent = visionApiResult.Adult.IsAdultContent,
            IsRacyContent = visionApiResult.Adult.IsRacyContent,
            RacyScore = visionApiResult.Adult.RacyScore
        };
        analysisResult.Colors = new ImageAnalysisColorResult
        {
            AccentColor = visionApiResult.Color.AccentColor,
            DominantColorBackground = visionApiResult.Color.DominantColorBackground,
            DominantColorForeground = visionApiResult.Color.DominantColorForeground,
            IsBWImg = visionApiResult.Color.IsBWImg
        };
        analysisResult.Categories = visionApiResult.Categories
            .Select(c => new ImageAnalysisCategoryResult { Text = c.Name, Score = c.Score })
            .OrderByDescending(c => c.Score)
            .ToList();
        analysisResult.Descriptions = visionApiResult.Description.Captions
            .Select(c => new ImageAnalysisDescriptionResult { Text = c.Text, Score = c.Confidence })
            .OrderByDescending(c => c.Score)
            .ToList();

        // Merge the tags detected by image analysis with the image tags.
        analysisResult.Tags = tagsResult.Tags
            .Select(t => new ImageAnalysisTagResult { Text = t.Name, Score = t.Confidence, Hint = t.Hint })
            .ToList();
        foreach (var t in visionApiResult.Description.Tags)
        {
            analysisResult.Tags.Add(new ImageAnalysisTagResult { Text = t, Score = 0.0, Hint = string.Empty });
        }

        analysisResult.Faces = visionApiResult.Faces
            .Select(f => new ImageAnalysisFaceResult
            {
                Age = f.Age,
                Gender = f.Gender == "Male" ? Gender.Male : f.Gender == "Female" ? Gender.Female : Gender.Unknown
            })
            .ToList();

        analysisResult.Text = ocrResult.Regions
            .Select(r => new ImageAnalysisTextResult()
            {
                Language = ocrResult.Language,
                Orientation = ocrResult.Orientation,
                TextAngle = ocrResult.TextAngle.GetValueOrDefault(),
                Text = string.Join(" ", r.Lines.Select(l => string.Join(" ", l.Words.Select(w => w.Text))))
            })
            .ToList();

        // Extend the analysis by estimating the reading time of each transcribed text.
        foreach (var text in analysisResult.Text)
        {
            text.WordCount = TextTokenizer.GetWordCount(text.Text);
            text.ReadingTimeInMinutes = ReadingTimeEstimator.GetEstimatedReadingTime(text.WordCount, text.Language);
            analysisResult.WatchingTimeInMinutes += text.ReadingTimeInMinutes;
        }

        // Add a default time estimating how long the user will spend watching the picture.
        analysisResult.WatchingTimeInMinutes += DefaultImageWatchingTime;
    }
    catch (Exception ex)
    {
        Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeImageAsync(): an error occurred while analyzing image - {ex.Message}");
    }

    return analysisResult;
}
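// Illustrative usage sketch (not from the original source): the ContentAnalyzer
// name comes from the log messages above, but its constructor signature is an
// assumption.
var analyzer = new ContentAnalyzer(visionApiSubscriptionKey);
ImageAnalysisResult result = await analyzer.AnalyzeImageAsync("https://example.com/photo.jpg");
Console.WriteLine($"Estimated watching time: {result.WatchingTimeInMinutes} min");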
/// <summary>
/// Performs clustering for the selected language using the current parameters.
/// </summary>
private void Cluster()
{
    try
    {
        TernaryIndex = new TernaryIndex(ClusterVectorSize, ClusterVectorDensity);

        // Tokenize every document and feed the token sequences into the index.
        Parallel.ForEach(DocFiles, docFile =>
        {
            string text;
            try
            {
                text = File.ReadAllText(Path.Combine(CommonFiles.DocsPath(LanguageId), docFile));
            }
            catch (IOException ex)
            {
                Debug.WriteLine(ex.Message + ex.StackTrace);
                return;
            }

            var tokenizer = new TextTokenizer(text);
            var tokens = tokenizer.Tokenize();
            TernaryIndex.ReadSequence(tokens, ClusterPreWindow, ClusterPostWindow);
        });

        TernaryIndex.SimilarityIndex.AddRange(TernaryIndex.BaseWords.ToArray());
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message + ex.StackTrace);
    }
}
public async Task<TextAnalysisResult> AnalyzeTextAsync(string text)
{
    // See: https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/csharp
    var analysisResult = new TextAnalysisResult();

    if (string.IsNullOrEmpty(text))
    {
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): no text to analyze");
        return analysisResult;
    }

    // Trim the input to the 5000-character maximum the service supports.
    string textToAnalyze = text;
    if (text.Length > 5000)
    {
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): text longer than supported length. Trimming it...");
        textToAnalyze = text.Substring(0, 5000);
    }

    Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): initializing TextAnalyticsAPI");
    ITextAnalyticsAPI m_TextAnalyticsClient = new TextAnalyticsAPI
    {
        AzureRegion = AzureRegions.Westeurope,
        SubscriptionKey = m_TextAnalyticsAPISubscriptionKey
    };

    Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): detecting content language");
    var batchLanguageResult = await m_TextAnalyticsClient.DetectLanguageAsync(new BatchInput(new List<Input>()
    {
        new Input("1", textToAnalyze)
    })).ConfigureAwait(false);

    if (batchLanguageResult.Errors.Count > 0)
    {
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while detecting language");
        foreach (var errors in batchLanguageResult.Errors)
        {
            Console.WriteLine($"\t{errors.Message}");
        }
        return analysisResult;
    }

    analysisResult.DetectedLanguage = batchLanguageResult.Documents[0].DetectedLanguages[0].Name;
    analysisResult.DetectedLanguageScore = batchLanguageResult.Documents[0].DetectedLanguages[0].Score.GetValueOrDefault();
    Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): detected language is '{analysisResult.DetectedLanguage}' ({(analysisResult.DetectedLanguageScore * 100):0.00}%)");

    Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): performing key-phrase extraction");
    var multiLanguageInput = new MultiLanguageBatchInput(new List<MultiLanguageInput>()
    {
        new MultiLanguageInput(batchLanguageResult.Documents[0].DetectedLanguages[0].Iso6391Name, "1", textToAnalyze)
    });
    var batchKeyphraseResult = await m_TextAnalyticsClient.KeyPhrasesAsync(multiLanguageInput).ConfigureAwait(false);

    if (batchKeyphraseResult.Errors.Count > 0)
    {
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while extracting key-phrases");
        foreach (var errors in batchKeyphraseResult.Errors)
        {
            Console.WriteLine($"\t\t\t\t{errors.Message}");
        }
        return analysisResult;
    }

    Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): retrieved {batchKeyphraseResult.Documents[0].KeyPhrases.Count} key-phrases:");
    foreach (var keyphrase in batchKeyphraseResult.Documents[0].KeyPhrases)
    {
        analysisResult.KeyPhrases.Add(keyphrase);
        Console.WriteLine($"\t\t\t\t{keyphrase}");
    }

    Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): performing sentiment analysis");
    var batchSentimentResult = await m_TextAnalyticsClient.SentimentAsync(multiLanguageInput).ConfigureAwait(false);
    if (batchSentimentResult.Errors.Count > 0)
    {
        Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while detecting sentiment");
        foreach (var errors in batchSentimentResult.Errors)
        {
            Console.WriteLine($"\t\t\t\t{errors.Message}");
        }
        return analysisResult;
    }

    analysisResult.SentimentScore = batchSentimentResult.Documents[0].Score.GetValueOrDefault();
    analysisResult.Sentiment = GetSentiment(analysisResult.SentimentScore);
    Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): sentiment is '{analysisResult.Sentiment}' ({(analysisResult.SentimentScore * 100):0.00}%)");

    // Extend the analysis by estimating the reading time for the content.
    analysisResult.WordCount = TextTokenizer.GetWordCount(text);
    analysisResult.ReadingTimeInMinutes = ReadingTimeEstimator.GetEstimatedReadingTime(analysisResult.WordCount, analysisResult.DetectedLanguage);

    return analysisResult;
}
private static List<IToken> Tokenize(string text)
{
    ITextTokenizer tokenizer = new TextTokenizer(new TextHelper());
    return tokenizer.Tokenize(text).ToList();
}