Example #1
        public void Test_Correct_TokenType()
        {
            string input = "everything (over 1.11 + 1)&& ;<";

            var tokens = TextTokenizer.TextTokenize(input);

            Assert.Equal(TextTokenType.Word, tokens[0].Type);

            Assert.Equal(TextTokenType.ParentheseStart, tokens[1].Type);

            Assert.Equal(TextTokenType.Word, tokens[2].Type);

            Assert.Equal(TextTokenType.Number, tokens[3].Type);

            Assert.Equal(TextTokenType.Operator, tokens[4].Type);

            Assert.Equal(TextTokenType.Number, tokens[5].Type);

            Assert.Equal(TextTokenType.ParentheseStop, tokens[6].Type);

            Assert.Equal(TextTokenType.Operator, tokens[7].Type);

            Assert.Equal(TextTokenType.EndOfSentence, tokens[8].Type);

            Assert.Equal(TextTokenType.Operator, tokens[9].Type);
        }
Example #2
        private List <Document> preprocessDataset(String directoryUrl)
        {
            List <Document> dataset = new List <Document>();

            string baseDirPath = Path.GetDirectoryName(Path.GetDirectoryName(System.IO.Directory.GetCurrentDirectory()));

            foreach (string file in Directory.EnumerateFiles(baseDirPath + @"\dataset", "*.json"))
            {
                string         json     = File.ReadAllText(file);
                List <DocItem> docItems = JsonConvert.DeserializeObject <List <DocItem> >(json);

                Document document;

                foreach (var item in docItems)
                {
                    if (item.topics == null || item.topics.Length == 0)
                    {
                        continue;
                    }
                    //for each doc item, tokenize its title and body and convert them into a Document object.
                    document            = TextTokenizer.tokenize(item.title + " " + item.body);
                    document.categories = item.topics.ToList <String>();
                    dataset.Add(document);
                }
            }
            return(dataset);
        }
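For reference, the deserialization above implies a simple DocItem shape. The sketch below reconstructs it from the field accesses (title, body, topics); it is an inference for illustration, not the original type:

        // Hypothetical DocItem shape implied by the
        // DeserializeObject<List<DocItem>>(json) call and field accesses above.
        public class DocItem
        {
            public string   title  { get; set; }
            public string   body   { get; set; }
            public string[] topics { get; set; }
        }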
Example #3
        public void Test_Correct_TokenContent()
        {
            string input = "everything is over 1.11 + 1 && 1 < 1";

            var tokens = TextTokenizer.TextTokenize(input);

            Assert.Equal("everything", new String(tokens[0].RawData.ToArray()));

            Assert.Equal("is", new String(tokens[1].RawData.ToArray()));

            Assert.Equal("over", new String(tokens[2].RawData.ToArray()));

            Assert.Equal("1.11", new String(tokens[3].RawData.ToArray()));

            Assert.Equal("+", new String(tokens[4].RawData.ToArray()));

            Assert.Equal("1", new String(tokens[5].RawData.ToArray()));

            Assert.Equal("&&", new String(tokens[6].RawData.ToArray()));

            Assert.Equal("1", new String(tokens[7].RawData.ToArray()));

            Assert.Equal("<", new String(tokens[8].RawData.ToArray()));

            Assert.Equal("1", new String(tokens[9].RawData.ToArray()));
        }
Example #4
        public async Task <IEnumerable <string> > Audios([FromBody] TextBody textBody)
        {
            var tokenizer = new TextTokenizer();
            var tokens    = tokenizer.GetWords(textBody.Input);
            var audios    = new List <string>();

            foreach (var token in tokens)
            {
                var composition = new Composition {
                    Return = new ContentSegment {
                        Url = $"https://forvo.com/word/{token.Value.RemoveAccents()}/#ru", Select = "span.play"
                    }
                };
                var elements = await composition.Return.DocumentElement();

                foreach (var element in elements)
                {
                    var onclick      = element.GetAttribute("onclick");
                    var onclickParts = onclick.Split(',');
                    if (onclickParts.Length >= 5)
                    {
                        audios.Add(Encoding.UTF8.GetString(Convert.FromBase64String(onclickParts[4].Trim(new[] { '\'', '"' }))));
                    }

                    if (audios.Count >= 5)
                    {
                        break;
                    }
                }
            }
            return(audios);
        }
Example #5
        /// <summary>
        /// Preprocesses the original dataset and converts it to a List of Documents.
        /// </summary>
        /// <param name="trainingDataset"> Map from category name to its training example texts. </param>
        /// <returns> The dataset as a list of tokenized Documents. </returns>
        private IList <Document> preprocessDataset(IDictionary <string, String[]> trainingDataset)
        {
            IList <Document> dataset = new List <Document>();

            string category;

            string[] examples;

            Document doc;

            IEnumerator <KeyValuePair <string, String[]> > it = trainingDataset.GetEnumerator();

            //loop through all the categories and training examples
            while (it.MoveNext())
            {
                KeyValuePair <string, String[]> entry = it.Current;
                category = entry.Key;
                examples = entry.Value;

                for (int i = 0; i < examples.Length; ++i)
                {
                    //for each example in the category tokenize its text and convert it into a Document object.
                    doc          = TextTokenizer.tokenize(examples[i]);
                    doc.category = category;
                    dataset.Add(doc);

                    //examples[i] = null; //try freeing some memory
                }

                //it.remove(); //try freeing some memory
            }

            return(dataset);
        }
Example #6
        public List <String> predict(String text, int topKCategories = 3)
        {
            if (knowledgeBase == null)
            {
                throw new ArgumentException("Knowledge base missing: make sure you train a classifier before using it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);
            double   occurrences;

            //String maxScoreCategory = null;
            //Double maxScore = Double.MinValue;

            Dictionary <String, double> predictionScores = new Dictionary <string, double>();

            foreach (var categoryCounts in knowledgeBase.logPriors)
            {
                double logprob = categoryCounts.Value;
                //foreach feature of the document
                foreach (var tokenCount in doc.tokens)
                {
                    if (!knowledgeBase.logConditionalProbability.ContainsKey(tokenCount.Key))
                    {
                        continue; //if the feature does not exist just skip it
                    }

                    occurrences = tokenCount.Value; //get its occurrences in text

                    if (knowledgeBase.logConditionalProbability[tokenCount.Key].ContainsKey(categoryCounts.Key))
                    {
                        logprob += occurrences * knowledgeBase.logConditionalProbability[tokenCount.Key][categoryCounts.Key]; //multiply the log-likelihood by the token's occurrences
                    }
                }
                predictionScores.Add(categoryCounts.Key, logprob);

                //if (categoryCounts.Value > maxScore)
                //{
                //    maxScore = categoryCounts.Value;
                //    maxScoreCategory = categoryCounts.Key;
                //}
            }

            var list = predictionScores.ToList();

            list.Sort((pair1, pair2) => { return(pair2.Value.CompareTo(pair1.Value)); });
            List <string> result = new List <string>();

            foreach (var l in list)
            {
                if (l.Value > 0.0)
                {
                    result.Add(l.Key);
                }
            }
            return(result.Count >= topKCategories ? result.GetRange(0, topKCategories) : result);  //return the categories with positive odds
        }
Example #7
        public void TestSplitToWords()
        {
            TextTokenizer tokenizer = new TextTokenizer();

            string text1 = "Я, ты, он, ,она - оно! Они: их 123, им?";

            string[] expected1 = new string[] { "Я", "ты", "он", "она", "оно", "Они", "их", "им" };
            string[] result1   = tokenizer.SplitToWords(text1);
            CollectionAssert.AreEqual(expected1, result1);
        }
Example #8
 public static QueryExpression Parse(string text)
 {
     using (var sr = new StringReader(text))
     {
         var tokenizer = new TextTokenizer(TextTokenizer.QUERY_MODE);
         tokenizer.Run(sr);
         var parser = new Parser();
         return(parser.Run(tokenizer.Texts));
     }
 }
Example #9
        private static void AddTermsFromPlainText(ISet <string> target, string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            var tokens = TextTokenizer.Tokenize(text);

            target.UnionWith(tokens);
        }
Example #10
        public void Order_Of_Adding_And_Multiplying_Is_Correct2()
        {
            string input = "1+2*3;";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            Assert.Equal(ExpressionType.BinaryOperation, AST.Type);
            Assert.Equal(BinaryExpressionType.Add, ((BinaryExpressionNode)AST).BinaryType);
        }
Example #11
        /// <summary>
        /// Predicts the category of a text by using an already trained classifier.
        /// </summary>
        /// <param name="text"> The text to classify. </param>
        /// <returns> The predicted category. </returns>
        /// <exception cref="ArgumentException"> Thrown when the classifier has not been trained. </exception>
        public virtual string predict(string text)
        {
            if (knowledgeBase == null)
            {
                throw new System.ArgumentException("Knowledge base missing: make sure you train a classifier before using it.");
            }

            //Tokenizes the text and creates a new document
            Document doc = TextTokenizer.tokenize(text);


            string category;
            string feature;
            int    occurrences;
            double?logprob;

            string maxScoreCategory = null;
            double?maxScore         = double.NegativeInfinity;

            //Map<String, Double> predictionScores = new HashMap<>();
            foreach (KeyValuePair <string, double> entry1 in knowledgeBase.logPriors)
            {
                category = entry1.Key;
                logprob  = entry1.Value; //initialize the scores with the priors

                //foreach feature of the document
                foreach (KeyValuePair <string, int> entry2 in doc.tokens)
                {
                    feature = entry2.Key;

                    if (!knowledgeBase.logLikelihoods.ContainsKey(feature))
                    {
                        continue; //if the feature does not exist in the knowledge base skip it
                    }

                    occurrences = entry2.Value;                                               //get its occurrences in text

                    logprob += occurrences * knowledgeBase.logLikelihoods[feature][category]; //multiply loglikelihood score with occurrences
                }
                //predictionScores.put(category, logprob);

                if (logprob > maxScore)
                {
                    maxScore         = logprob;
                    maxScoreCategory = category;
                }
            }

            return(maxScoreCategory); //return the category with the highest score
        }
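The loop above computes, for each category c, score(c) = logPrior(c) + Σ count(w) × logLikelihood(w|c) over the document's tokens w, then keeps the argmax. A minimal self-contained sketch of that scoring step, using plain dictionaries in place of the knowledge base (the shapes are assumptions, not the classifier's actual types):

        // Minimal sketch of the log-space Naive Bayes scoring used above, with
        // plain dictionaries standing in for the knowledge base
        // (requires System.Collections.Generic).
        static string ArgMaxCategory(
            Dictionary<string, double> logPriors,                          // category -> log P(c)
            Dictionary<string, Dictionary<string, double>> logLikelihoods, // feature -> category -> log P(w|c)
            Dictionary<string, int> docTokens)                             // feature -> occurrences in the document
        {
            string best = null;
            double bestScore = double.NegativeInfinity;

            foreach (var prior in logPriors)
            {
                double score = prior.Value; // initialize with the prior
                foreach (var token in docTokens)
                {
                    if (logLikelihoods.TryGetValue(token.Key, out var perCategory) &&
                        perCategory.TryGetValue(prior.Key, out var logLikelihood))
                    {
                        score += token.Value * logLikelihood; // occurrences * log-likelihood
                    }
                }
                if (score > bestScore)
                {
                    bestScore = score;
                    best      = prior.Key;
                }
            }
            return best;
        }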
Example #12
        private static void Apply(IssueFilter result, BoundTextQuery expression)
        {
            var terms = TextTokenizer.Tokenize(expression.Text);

            foreach (var term in terms)
            {
                if (expression.IsNegated)
                {
                    result.ExcludedTerms.Add(term);
                }
                else
                {
                    result.IncludedTerms.Add(term);
                }
            }
        }
Example #13
        public void One_Plus_One_Is_Two()
        {
            string input = "1 + 1";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();

            ExecutionEngine executionEngine = new ExecutionEngine(context);

            executionEngine.ExecuteExpression(AST);

            Assert.Equal(2.0, (double)AST.EvaluatedValue.Value);
        }
Example #14
        public void Basic_Boolean_Logic_Got_Correct_AST()
        {
            string input = "1 + 3 > 1 && 1 + 1 > 0";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            var BinaryAst = (BinaryExpressionNode)AST;

            var left = (BinaryExpressionNode)BinaryAst.Left;

            var right = (BinaryExpressionNode)BinaryAst.Right;

            Assert.Equal(BinaryExpressionType.GreaterThan, left.BinaryType);
            Assert.Equal(BinaryExpressionType.GreaterThan, right.BinaryType);
        }
Example #15
        public override string Summarize(string text, string lang = "en")
        {
            var sentences = SplitTextOnSentences(text);

            int summarySize = 3;

            if (sentences.Count <= summarySize)
            {
                return(text);
            }

            _stopWordFilter = _nlpServiceProvider.GetStopWordFilter(lang);

            var stemmer = _nlpServiceProvider.GetStemmer(lang);

            var tokenizedSentences = new List <IList <string> >();

            foreach (var sentence in sentences)
            {
                var tokenizer = new TextTokenizer(sentence, filterMapper: new TextFilterMapper {
                    Map = (t) => stemmer.Stem(t)
                });
                tokenizedSentences.Add(tokenizer.ToList());
            }

            var matrix = BuildSimilarityMatrix(tokenizedSentences);

            var graph = BuildDirectedGraph(matrix);

            var result = new PageRank()
                         .Rank(graph)
                         .OrderBy(kv => kv.Value);    //Lower value means a better rank

            var summary      = "";
            var topSentences = result.Take(summarySize).OrderBy(kv => kv.Key); //Restore the sentences' original order in the text

            foreach (var topSent in topSentences)
            {
                summary += sentences[topSent.Key] + ". ";
            }
            }

            return(summary);
        }
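BuildSimilarityMatrix and BuildDirectedGraph are not shown. One plausible sentence-similarity measure over the stemmed token lists is Jaccard overlap; the sketch below is an assumption for illustration, not the original implementation (requires System.Linq and System.Collections.Generic):

        // Hypothetical similarity measure a BuildSimilarityMatrix implementation
        // could use: Jaccard overlap of the stemmed token sets of two sentences.
        private static double[,] BuildJaccardMatrix(IList<IList<string>> tokenizedSentences)
        {
            var sets   = tokenizedSentences.Select(s => new HashSet<string>(s)).ToList();
            var matrix = new double[sets.Count, sets.Count];

            for (int i = 0; i < sets.Count; i++)
            {
                for (int j = 0; j < sets.Count; j++)
                {
                    if (i == j)
                    {
                        continue; // a sentence is not compared with itself
                    }
                    int common = sets[i].Count(token => sets[j].Contains(token));
                    int union  = sets[i].Count + sets[j].Count - common;
                    matrix[i, j] = union == 0 ? 0.0 : (double)common / union;
                }
            }
            return matrix;
        }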
Example #16
        public CrawledIndexCompletionProvider(CrawledIndex index)
        {
            _orgs = new SortedSet <string>(
                index.Repos.Select(r => r.Org),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _repos = new SortedSet <string>(
                index.Repos.SelectMany(r => new[] { r.Name, r.FullName }),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _users = new SortedSet <string>(
                index.Repos.SelectMany(r => r.Issues.Values)
                .SelectMany(i => new[] { i.CreatedBy }.Concat(i.Assignees)),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _labels = new SortedSet <string>(
                index.Repos.SelectMany(r => r.Labels)
                .Select(l => l.Name),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _milestones = new SortedSet <string>(
                index.Repos.SelectMany(r => r.Milestones)
                .Select(m => m.Title),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _areaPaths = new SortedSet <string>(
                index.Repos.SelectMany(r => r.Labels)
                .SelectMany(l => TextTokenizer.GetAreaPaths(l.Name)),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();

            _areaNodes = new SortedSet <string>(
                index.Repos.SelectMany(r => r.Labels)
                .SelectMany(l => TextTokenizer.GetAreaPaths(l.Name, segmentsOnly: true)),
                StringComparer.OrdinalIgnoreCase
                ).ToArray();
        }
Example #17
        public async Task <Translation> Translation([FromBody] TextBody textBody)
        {
            var translation = new Translation();

            using (var service = new TranslateService(new BaseClientService.Initializer
            {
                ApiKey = "AIzaSyBiPTuvLggID2YrmBshuHBZhij6HeFOxko",
                ApplicationName = "Project Name"
            }))
            {
                var input = textBody.Input;

                if (input.StartsWith("en:"))
                {
                    var russianResponse = await service.Translations.List(new string[] { input.Replace("en:", "") }, "ru").ExecuteAsync();

                    input = russianResponse.Translations[0].TranslatedText;
                }


                var tokenizer = new TextTokenizer();
                var blocks    = tokenizer.GetBlocks(input);

                var blockResponse = await service.Translations.List(blocks.Select(t => t.OriginalText).ToArray(), "en").ExecuteAsync();


                for (int i = 0; i < blockResponse.Translations.Count; i++)
                {
                    blocks[i].TranslatedText = blockResponse.Translations[i].TranslatedText;
                    var words        = tokenizer.GetWords(blocks[i].OriginalText);
                    var wordResponse = await service.Translations.List(words.Select(t => t.Value).ToArray(), "en").ExecuteAsync();

                    for (int j = 0; j < wordResponse.Translations.Count; j++)
                    {
                        words[j].Translation = wordResponse.Translations[j].TranslatedText;
                        blocks[i].Words.Add(words[j]);
                    }
                    translation.Blocks.Add(blocks[i]);
                }
                return(translation);
            }
        }
Example #18
        public void TestCase()
        {
            var text      = "This is  {test} \"te\\\"xt\".";
            var tokenizer = new TextTokenizer(text,
                                              WordRule.Default, WhitespaceRule.Default, StringRule.Default,
                                              new SingleCharRule('{'), new SingleCharRule('}'), new SingleCharRule('.'));

            var tokens = tokenizer.ReadToEnd();

            Assert.AreEqual(tokens [0].Rule, WordRule.Default);
            Assert.AreEqual(tokens [1].Rule, WhitespaceRule.Default);
            Assert.AreEqual(tokens [2].Rule.GetType(), typeof(WordRule));
            Assert.AreEqual(tokens [3].Text, "  ");
            Assert.AreEqual(tokens [4].Rule.GetType(), typeof(SingleCharRule));
            Assert.AreEqual(tokens [5].Text, "test");
            Assert.AreEqual(tokens [6].Text, "}");
            Assert.AreEqual(tokens [7].Rule.GetType(), typeof(WhitespaceRule));
            Assert.AreEqual(tokens [8].Rule, StringRule.Default);
            Assert.AreEqual(tokens [8].Text, "te\"xt");
        }
Example #19
        public void Variables_Can_Be_Found_By_Name()
        {
            string input = "1+Apple";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            var BinaryAst = (BinaryExpressionNode)AST;

            var left = (ConstantExpression)BinaryAst.Left;

            var right = (VariableExpression)BinaryAst.Right;

            Assert.Equal(BinaryExpressionType.Add, BinaryAst.BinaryType);
            Assert.Equal(1.0, (double)left.Value.Value);
            Assert.Equal("Apple", right.VariableValue.VariableName);
        }
Example #20
        public void One_Plus_Two_Is_Greater_Than_Two()
        {
            string input = "1 + 2 > 2";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();

            ExecutionEngine executionEngine = new ExecutionEngine(context);

            executionEngine.ExecuteExpression(AST);

            Assert.Equal(ValueType.Logical, AST.EvaluatedValue.Type);

            Assert.Equal(true, (bool)AST.EvaluatedValue.Value);
        }
Example #21
        public void Add_Operator_Has_Higher_Precedence_Than_And()
        {
            string input = "1 && 1 + 1";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            var BinaryAst = (BinaryExpressionNode)AST;

            var left = (ConstantExpression)BinaryAst.Left;

            var right = (BinaryExpressionNode)BinaryAst.Right;

            Assert.Equal(BinaryExpressionType.And, BinaryAst.BinaryType);
            Assert.Equal(1.0, (double)left.Value.Value);
            Assert.Equal(BinaryExpressionType.Add, right.BinaryType);
        }
Example #22
        public void Variable_Can_Be_Found_From_Context()
        {
            string input = "Word";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            global::BRE.ExecutionContext context = new global::BRE.ExecutionContext();

            context.AddVariableValue("word", 2.313);

            ExecutionEngine executionEngine = new ExecutionEngine(context);

            executionEngine.ExecuteExpression(AST);

            Assert.Equal(ValueType.Number, AST.EvaluatedValue.Type);

            Assert.Equal(2.313, (double)AST.EvaluatedValue.Value);
        }
Example #23
        public void Parentheses_Affect_The_Order_Of_AST()
        {
            string input = "1*(2+2.2)+3;";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            var BinaryAst = (BinaryExpressionNode)AST;

            var left = (BinaryExpressionNode)BinaryAst.Left;

            var leftRight = (ExpressionNode)left.Right;

            var targetExpression = (BinaryExpressionNode)leftRight.Expression;

            Assert.Equal(BinaryExpressionType.Add, targetExpression.BinaryType);
            Assert.Equal(2.0, (double)targetExpression.Left.Value.Value);
            Assert.Equal(2.2, (double)targetExpression.Right.Value.Value);
        }
Example #24
        public void Ternary_Expression_Parsed_Correctly()
        {
            string input = " 1 == 2 ? 2 : 3";

            var TextTokens = TextTokenizer.TextTokenize(input);

            Parser parsers = new Parser(TextTokens);

            var AST = parsers.ParseExpression();

            var ternary = (TernaryExpressionNode)AST;

            var first = (BinaryExpressionNode)ternary.First;

            var second = (ConstantExpression)ternary.Second;

            var third = (ConstantExpression)ternary.Third;


            Assert.Equal(BinaryExpressionType.Equal, first.BinaryType);
            Assert.NotNull(second);
            Assert.NotNull(third);
        }
Example #25
        bool ReadNode(HtmlElement parentElement, out HtmlNode node, string endElementLocalName = null)
        {
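            // Reads the next node from the tag stream: doctype, self-closing tag,
            // start tag (recursing into children until the matching end tag is seen),
            // end tag, text, or comment. For "content" tags (HtmlAPI.IsContentTag is
            // not shown; presumably script/style and the like) the body is consumed
            // as raw text up to the closing tag instead of being parsed.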
            if (MoveNext())
            {
                switch (Current.Type)
                {
                case HtmlTagType.Define:
                    node = new HtmlDocumentType(Current.Attributes, parentElement);
                    return(true);

                case HtmlTagType.Single:
                    node = new HtmlElement(Current.Name, Current.Attributes, parentElement);
                    return(true);

                case HtmlTagType.Start: {
                    HtmlElement element = new HtmlElement(Current.Name, Current.Attributes, parentElement);
                    node = element;
                    if (HtmlAPI.IsContentTag(element.LocalName))
                    {
                        TextTokenizer.SkipIgnoreCharacters();
                        string text = TextTokenizer.ReadTo($"</{element.LocalName}>").TrimEnd();
                        if (text.Length > 0)
                        {
                            node.ChildNodes.Add(new HtmlText(text, element));
                        }
                        return(true);
                    }
                    Tags.Add(element.LocalName);
                    HtmlNode lastNode = null;
                    while (ReadNode(element, out HtmlNode childNode, element.LocalName))
                    {
                        if (childNode is null)
                        {
                            continue;
                        }
                        if (lastNode != null)
                        {
                            lastNode.NextSibling      = childNode;
                            childNode.PreviousSibling = lastNode;
                        }
                        node.ChildNodes.Add(childNode);
                        lastNode = childNode;
                    }
                    return(true);
                }

                case HtmlTagType.End:
                    if (endElementLocalName != null && !HtmlAPI.Equals(Current.Name, endElementLocalName))
                    {
                        if (Tags.Contains(Current.Name))
                        {
                            Buffer.Push(Current);
                        }
                        else
                        {
                            node = null;
                            return(true);
                        }
                    }
                    if (Tags.Count > 0)
                    {
                        Tags.RemoveAt(Tags.Count - 1);
                    }
                    break;

                case HtmlTagType.Text:
                    node = new HtmlText(HtmlAPI.Unescape(Current.Content), parentElement);
                    return(true);

                case HtmlTagType.Comment:
                    node = new HtmlComment(Current.Content, parentElement);
                    return(true);

                default:
                    throw new NotImplementedException(Current.Type.ToString());
                }
            }
            node = null;
            return(false);
        }
Example #26
 public void TokenizerShouldConstruct()
 {
     ITextTokenizer textTokenizer = new TextTokenizer(new TextHelper());
 }
Example #27
        private static IEnumerable<string> GetFileTokens(string fileName, bool ignoreCase)
        {
            string text = null;
            try
            {
                text = File.ReadAllText(fileName);
            }
            catch (IOException ex)
            {
                Debug.WriteLine(ex.Message + ex.StackTrace);
            }

            if (text != null)
            {
                var tokenizer = new TextTokenizer(text);
                var tokens = tokenizer.Tokenize();

                var filteredWords = tokens.Select(token => ignoreCase ? token.ToLower() : token);

                foreach (var filteredWord in filteredWords)
                {
                    yield return filteredWord;
                }
            }
        }
Example #28
            public object ParseValue(TextTokenizer tokenizer, ITokenRule endToken)
            {
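                // Grammar, as implemented below:
                //   value := word | text
                //          | '[' [ value { ',' value } ] ']'
                //          | '{' [ word ':' value { ',' word ':' value } ] '}'
                // A bare word is parsed as a bool or a double; endToken, when not
                // null, is also accepted and makes the method return null.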
                var required = endToken == null ? new [] { word, text, startObject, startArray } : new [] { word, text, startObject, startArray, endToken };

                var token = tokenizer.NextToken(required: required);

                if (token.Rule == word)
                {
                    var boolValue   = false;
                    var numberValue = 0.0d;

                    if (bool.TryParse(token.Text, out boolValue))
                    {
                        return(boolValue);
                    }

                    if (double.TryParse(token.Text, out numberValue))
                    {
                        return(numberValue);
                    }

                    throw new NotSupportedException("unsupported token");
                }
                else if (token.Rule == text)
                {
                    return(token.Text);
                }
                else if (token.Rule == startArray)
                {
                    var list = new List <object> ();

                    object val = null;

                    while ((val = ParseValue(tokenizer, endArray)) != null)
                    {
                        list.Add(val);

                        if (tokenizer.NextToken(required: new [] { endArray, comma }).Rule == endArray)
                        {
                            break;
                        }
                    }

                    return(list.ToArray());
                }
                else if (token.Rule == startObject)
                {
                    var obj = new Dictionary <string, object> ();

                    do
                    {
                        var prop = tokenizer.NextToken(new [] { word, endObject });

                        if (prop.Rule == endObject)
                        {
                            break;
                        }

                        tokenizer.NextToken(new [] { colon });

                        obj.Add(prop.Text, ParseValue(tokenizer, null));

                        token = tokenizer.NextToken(new [] { comma, endObject });
                    } while(token.Rule != endObject);

                    return(obj);
                }

                return(null);
            }
Example #29
        public async Task <ImageAnalysisResult> AnalyzeImageAsync(string url)
        {
            var analysisResult = new ImageAnalysisResult();

            try
            {
                // USING the Microsoft-provided VisionClientLibrary does not seem to work in .NET Core as-is; a fix is required for ExpandoObject
                // see: https://github.com/Microsoft/Cognitive-Vision-DotNetCore/pull/1/commits/9c4647edb400aecd4def330537d5bcd74f126111

                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): initializing VisionAPI client");

                var visionApiClient = new VisionServiceClient(m_VisionAPISubscriptionKey, "https://westeurope.api.cognitive.microsoft.com/vision/v1.0");

                var visualFeatures = new List <VisualFeature> {
                    VisualFeature.Adult, VisualFeature.Categories, VisualFeature.Color, VisualFeature.Description, VisualFeature.Faces, VisualFeature.ImageType                                           /*, VisualFeature.Tags */
                };
                var details = new List <string> {
                    "Celebrities", "Landmarks"
                };

                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): started image analysis");

                var visionApiResult = await visionApiClient.AnalyzeImageAsync(url, visualFeatures, details).ConfigureAwait(false);

                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): executing OCR");

                var ocrResult = await visionApiClient.RecognizeTextAsync(url).ConfigureAwait(false);

                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): performing tag identification");

                var tagsResult = await visionApiClient.GetTagsAsync(url).ConfigureAwait(false);

                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeImageAsync(): analysis completed");

                // Mapping VisionAPI Client entity to domain entity
                analysisResult.AdultContent = new ImageAnalysisAdultContentResult {
                    AdultScore = visionApiResult.Adult.AdultScore, IsAdultContent = visionApiResult.Adult.IsAdultContent, IsRacyContent = visionApiResult.Adult.IsRacyContent, RacyScore = visionApiResult.Adult.RacyScore
                };
                analysisResult.Colors = new ImageAnalysisColorResult {
                    AccentColor = visionApiResult.Color.AccentColor, DominantColorBackground = visionApiResult.Color.DominantColorBackground, DominantColorForeground = visionApiResult.Color.DominantColorForeground, IsBWImg = visionApiResult.Color.IsBWImg
                };
                analysisResult.Categories = visionApiResult.Categories.Select(c => new ImageAnalysisCategoryResult {
                    Text = c.Name, Score = c.Score
                }).OrderByDescending(c => c.Score).ToList();
                analysisResult.Descriptions = visionApiResult.Description.Captions.Select(c => new ImageAnalysisDescriptionResult {
                    Text = c.Text, Score = c.Confidence
                }).OrderByDescending(c => c.Score).ToList();

                // Merge detected tags from image analysis and image tags
                analysisResult.Tags = tagsResult.Tags.Select(t => new ImageAnalysisTagResult {
                    Text = t.Name, Score = t.Confidence, Hint = t.Hint
                }).ToList();
                foreach (var t in visionApiResult.Description.Tags)
                {
                    analysisResult.Tags.Add(new ImageAnalysisTagResult {
                        Text = t, Score = 0.0, Hint = string.Empty
                    });
                }

                analysisResult.Faces = visionApiResult.Faces.Select(f => new ImageAnalysisFaceResult {
                    Age = f.Age, Gender = f.Gender == "Male" ? Gender.Male : f.Gender == "Female" ? Gender.Female : Gender.Unknown
                }).ToList();
                analysisResult.Text = ocrResult.Regions.Select(r => new ImageAnalysisTextResult()
                {
                    Language = ocrResult.Language, Orientation = ocrResult.Orientation, TextAngle = ocrResult.TextAngle.GetValueOrDefault(), Text = string.Join(" ", r.Lines.Select(l => string.Join(" ", l.Words.Select(w => w.Text))))
                }).ToList();

                // Extend analysis by estimating reading time for each transcribed text
                foreach (var text in analysisResult.Text)
                {
                    text.WordCount                        = TextTokenizer.GetWordCount(text.Text);
                    text.ReadingTimeInMinutes             = ReadingTimeEstimator.GetEstimatedReadingTime(text.WordCount, text.Language);
                    analysisResult.WatchingTimeInMinutes += text.ReadingTimeInMinutes;
                }

                // Add an additional default time for estimating how long it will take to the user to watch the picture
                analysisResult.WatchingTimeInMinutes += DefaultImageWatchingTime;
            }
            catch (Exception ex)
            {
                Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeImageAsync(): an error occured while analyzing image - {ex.Message}");
            }

            return(analysisResult);
        }
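TextTokenizer.GetWordCount and ReadingTimeEstimator.GetEstimatedReadingTime are used here and in Example #31 but are not shown. A minimal sketch of what such helpers could look like, assuming a whitespace-based word count and a flat words-per-minute rate (both are assumptions; the real estimator may vary the rate by language):

        // Hypothetical stand-ins for the word-count and reading-time helpers used
        // above; the actual implementations and per-language rates are not shown
        // (requires System).
        public static class ReadingTimeSketch
        {
            private const double WordsPerMinute = 200.0; // assumed average reading rate

            public static int GetWordCount(string text) =>
                string.IsNullOrWhiteSpace(text)
                    ? 0
                    : text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries).Length;

            public static double GetEstimatedReadingTime(int wordCount, string language) =>
                wordCount / WordsPerMinute; // a per-language rate could be chosen here
        }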
Example #30
        /// <summary>
        /// Performs clustering for the selected language using the current parameters.
        /// </summary>
        private void Cluster()
        {
            try
            {
                TernaryIndex = new TernaryIndex(ClusterVectorSize, ClusterVectorDensity);

                Parallel.ForEach(DocFiles, docFile =>
                {
                    string text;
                    try
                    {
                        text = File.ReadAllText(Path.Combine(CommonFiles.DocsPath(LanguageId), docFile));
                    }
                    catch (IOException ex)
                    {
                        Debug.WriteLine(ex.Message + ex.StackTrace);
                        return;
                    }

                    var tokenizer = new TextTokenizer(text);
                    var tokens = tokenizer.Tokenize();
                    TernaryIndex.ReadSequence(tokens, ClusterPreWindow, ClusterPostWindow);
                });

                TernaryIndex.SimilarityIndex.AddRange(TernaryIndex.BaseWords.ToArray());

                //TernaryIndex.SimilarityIndex.BuildIndex(TernaryIndex.Contexts.Cast<ITernaryVector>().ToArray());
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message + ex.StackTrace);
            }
        }
Example #31
        public async Task <TextAnalysisResult> AnalyzeTextAsync(string text)
        {
            // See: https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/csharp

            var analysisResult = new TextAnalysisResult();

            if (string.IsNullOrEmpty(text))
            {
                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): no text to analyze");
                return(analysisResult);
            }

            string textToAnalyze = text;

            if (text.Length > 5000)
            {
                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): text longer than supported length. Trimming it...");
                textToAnalyze = text.Substring(0, 5000);
            }

            Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): initializing TextAnalyticsAPI");

            ITextAnalyticsAPI m_TextAnalyticsClient = new TextAnalyticsAPI
            {
                AzureRegion     = AzureRegions.Westeurope,
                SubscriptionKey = m_TextAnalyticsAPISubscriptionKey
            };

            Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): detecting content language");

            var batchLanguageResult = await m_TextAnalyticsClient.DetectLanguageAsync(new BatchInput(new List <Input>()
            {
                new Input("1", textToAnalyze)
            })).ConfigureAwait(false);

            if (batchLanguageResult.Errors.Count > 0)
            {
                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while detecting language");
                foreach (var errors in batchLanguageResult.Errors)
                {
                    Console.WriteLine($"\t{errors.Message}");
                }
                return(analysisResult);
            }

            analysisResult.DetectedLanguage      = batchLanguageResult.Documents[0].DetectedLanguages[0].Name;
            analysisResult.DetectedLanguageScore = batchLanguageResult.Documents[0].DetectedLanguages[0].Score.GetValueOrDefault();

            Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): detected language is '{analysisResult.DetectedLanguage}' ({(analysisResult.DetectedLanguageScore * 100):0.00}%)");

            Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): performing key-phrase extraction");

            var multiLanguageInput = new MultiLanguageBatchInput(new List <MultiLanguageInput>()
            {
                new MultiLanguageInput(batchLanguageResult.Documents[0].DetectedLanguages[0].Iso6391Name, "1", textToAnalyze)
            });
            var batchKeyphraseResult = await m_TextAnalyticsClient.KeyPhrasesAsync(multiLanguageInput).ConfigureAwait(false);

            if (batchKeyphraseResult.Errors.Count > 0)
            {
                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while extracting key-phrases");
                foreach (var errors in batchKeyphraseResult.Errors)
                {
                    Console.WriteLine($"\t\t\t\t{errors.Message}");
                }
                return(analysisResult);
            }

            Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): retrieved {batchKeyphraseResult.Documents[0].KeyPhrases.Count} key-phrases:");
            foreach (var keyphrase in batchKeyphraseResult.Documents[0].KeyPhrases)
            {
                analysisResult.KeyPhrases.Add(keyphrase);
                Console.WriteLine($"\t\t\t\t{keyphrase}");
            }

            Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): performing sentiment analysis");

            var batchSentimentResult = await m_TextAnalyticsClient.SentimentAsync(multiLanguageInput).ConfigureAwait(false);

            if (batchSentimentResult.Errors.Count > 0)
            {
                Console.WriteLine("\t\t\tContentAnalyzer.AnalyzeTextAsync(): error while detecting sentiment");
                foreach (var errors in batchSentimentResult.Errors)
                {
                    Console.WriteLine($"\t\t\t\t{errors.Message}");
                }
                return(analysisResult);
            }

            analysisResult.SentimentScore = batchSentimentResult.Documents[0].Score.GetValueOrDefault();
            analysisResult.Sentiment      = GetSentiment(analysisResult.SentimentScore);

            Console.WriteLine($"\t\t\tContentAnalyzer.AnalyzeTextAsync(): sentiment is '{analysisResult.Sentiment}' ({(analysisResult.SentimentScore * 100):0.00}%)");

            // Extend analysis by estimating reading time for content
            analysisResult.WordCount            = TextTokenizer.GetWordCount(text);
            analysisResult.ReadingTimeInMinutes = ReadingTimeEstimator.GetEstimatedReadingTime(analysisResult.WordCount, analysisResult.DetectedLanguage);

            return(analysisResult);
        }
Example #32
        private static List <IToken> Tokenize(string text)
        {
            ITextTokenizer tokenizer = new TextTokenizer(new TextHelper());

            return(tokenizer.Tokenize(text).ToList());
        }