static void Main(string[] args)
        {
            // Read the whole input document up front; every rule below works on this text.
            var inputText = File.ReadAllText("Data.txt");

            // Split rule: break on whitespace that follows '.', '!' or '?' and precedes a capital letter.
            var sentenceSplitter = new SingleRegExpRule(
                @"(?<=[.!?])\s+(?=[A-Z])",
                RegExpRuleType.RegExpSplit,
                "Split sentences");

            // Filter rule: matches entries whose last character is a question mark.
            var questionFilter = new SingleRegExpRule(
                @"[?]$",
                RegExpRuleType.RegExpFilter,
                "Filter quesions");

            // Filter rule built from two date patterns combined as a union:
            // numeric dates separated by '/' or '-', and dates written with a month name.
            var datePatterns = new List<string>
            {
                @"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}",
                @"(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}"
            };
            var dateFilter = new MultipleRegExpRules(
                datePatterns,
                RegExpRuleType.RegExpFilter,
                RulesConnectionType.Union,
                "Filter dates");

            // Wire the console visualizer to the publisher so each step's result is reported.
            var visualizer   = new RegExpResultConsoleVisualizer();
            var onTextParsed = new EventHandler<RegExpParserResultEventArgs>(visualizer.Visualize);
            var publisher    = new RegExpParserResultPublisher(onTextParsed);

            // Run the pipeline one step at a time: split, then the two filters in sequence.
            var seed      = new RegExpParserResult(inputText, null);
            var sentences = seed.SplitByRegExp(sentenceSplitter, publisher);
            var questions = sentences.FilterByRegExp(questionFilter, publisher);
            questions.FilterByRegExp(dateFilter, publisher);
        }
示例#2
0
        /// <summary>
        /// Splits the source text of the previous result with the rule's regular expression,
        /// pairs every piece with its offset in that source text, hands the new result to the
        /// publisher and publishes it.
        /// </summary>
        /// <param name="prevParserResult">Result whose <c>SourceText</c> is split.</param>
        /// <param name="rule">Rule of type <c>RegExpRuleType.RegExpSplit</c> providing the pattern and the title.</param>
        /// <param name="resultPublisher">Publisher that receives the new result and is asked to publish it.</param>
        /// <returns>The result stored on <paramref name="resultPublisher"/>, cast to <c>RegExpParserResult</c>.</returns>
        /// <exception cref="NotSupportedException">The rule's type is not <c>RegExpRuleType.RegExpSplit</c>.</exception>
        public static RegExpParserResult SplitByRegExp(this RegExpParserResult prevParserResult,
                                                       SingleRegExpRule rule, IRegExpParserResultPublisher <RegExpRuleType, RegExpParserResultEventArgs> resultPublisher)
        {
            //check if regular expression type matches operation type
            if (rule.RuleType != RegExpRuleType.RegExpSplit)
            {
                throw new NotSupportedException(
                          String.Format("Method accepts regular expressions that are aimed to split text, but this has type {0}",
                                        rule.RuleType.ToString()));
            }

            //split blank text
            var sourceText = prevParserResult.SourceText;
            var entryArray = Regex.Split(sourceText, rule.RegularExpression);

            //identify entries indexes
            var entryIndexDictionary = new List <KeyValuePair <int, string> >(entryArray.Length);

            // Pieces come back in document order, so each one is searched for starting where the
            // previous piece ended. This gives repeated pieces distinct, increasing offsets
            // instead of all pointing at the first occurrence. The search is ordinal because a
            // culture-sensitive IndexOf can skip or misplace characters such as line breaks.
            // Still a best effort: a piece whose text also occurs inside the preceding separator
            // would be located there.
            int searchFrom = 0;

            foreach (var entry in entryArray)
            {
                int entryIndex = sourceText.IndexOf(entry, searchFrom, StringComparison.Ordinal);

                if (entryIndex >= 0)
                {
                    searchFrom = entryIndex + entry.Length;
                }
                else
                {
                    // Not found ahead of the cursor (e.g. text captured by a look-behind group):
                    // fall back to a whole-text search and leave the cursor where it is.
                    entryIndex = sourceText.IndexOf(entry, StringComparison.Ordinal);
                }

                entryIndexDictionary.Add(new KeyValuePair <int, string>(entryIndex, entry));
            }

            // NOTE(review): offsets above are relative to prevParserResult.SourceText, while the
            // text stored on the new result is the joined entries whenever the previous result
            // has any - confirm this asymmetry is intended.
            var hasPreviousEntries = prevParserResult.Entries != null && prevParserResult.Entries.Any();
            var resultText         = hasPreviousEntries
                ? String.Join("\n", prevParserResult.Entries.Select(e => e.Value))
                : prevParserResult.SourceText;

            resultPublisher.RegExpParserResult = new RegExpParserResult(
                resultText, entryIndexDictionary, rule.Title, prevParserResult);
            resultPublisher.Publish();
            return((RegExpParserResult)resultPublisher.RegExpParserResult);
        }
        /// <summary>
        /// Verifies that applying the sentence-splitting rule to the fixture text yields nine entries.
        /// </summary>
        public void SplitByRegExp_WithSingleRegExpRule_Split9Sentences()
        {
            //Arrange
            var splitSentences = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");
            var initData       = new RegExpParserResult(_data, null);

            //Act
            var sentences = initData.SplitByRegExp(splitSentences);

            //Assert
            // AreEqual reports expected vs. actual on failure; IsTrue(a == b) only reports "false".
            Assert.AreEqual(9, sentences.Entries.Count());
        }
        /// <summary>
        /// Verifies that splitting the fixture text into sentences and then applying the
        /// question filter leaves three entries.
        /// </summary>
        public void ChainOfSplitByRegExpAndFilterByRegExp_WithSingleRegExpRule_Split9SentencesAdnFiltersQuestions()
        {
            //Arrange
            var splitSentences  = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");
            var filterQuestions = new SingleRegExpRule(@"[?]$", RegExpRuleType.RegExpFilter, "Filter quesions");
            var initData        = new RegExpParserResult(_data, null);

            //Act
            var questions = initData.SplitByRegExp(splitSentences).FilterByRegExp(filterQuestions);

            //Assert
            // AreEqual reports expected vs. actual on failure; IsTrue(a == b) only reports "false".
            Assert.AreEqual(3, questions.Entries.Count());
        }
        /// <summary>
        /// Verifies that splitting the fixture text into sentences and then applying the
        /// union of the two date patterns as a filter leaves six entries.
        /// </summary>
        public void ChainOfSplitByRegExpAndFilterByRegExp_WithMultipleRegExpRule_Split9SentencesAdnFiltersSentencesWithDates()
        {
            //Arrange
            var splitSentences          = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");

            // The rule is passed to FilterByRegExp, so it is declared as a filter rule. The
            // sibling operations reject rules whose type does not match the operation;
            // presumably FilterByRegExp does the same - verify against its implementation.
            var filterSentencesWithDate = new MultipleRegExpRules(new List <string>
            {
                @"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}",
                @"(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}"
            }, RegExpRuleType.RegExpFilter, RulesConnectionType.Union, "Filter dates");
            var initData = new RegExpParserResult(_data, null);

            //Act
            var sentencesWithBothFormsOfDates = initData.SplitByRegExp(splitSentences).FilterByRegExp(filterSentencesWithDate);

            //Assert
            // AreEqual reports expected vs. actual on failure; IsTrue(a == b) only reports "false".
            Assert.AreEqual(6, sentencesWithBothFormsOfDates.Entries.Count());
        }
        /// <summary>
        /// Runs a find rule over the entries of the previous result, hands the new result to
        /// the publisher and publishes it.
        /// </summary>
        /// <param name="prevParserResult">Result whose <c>Entries</c> are searched; they must not be null.</param>
        /// <param name="rule">
        /// Rule of type <c>RegExpRuleType.RegExpFind</c>; either a <c>SingleRegExpRule</c> or a
        /// <c>MultipleRegExpRules</c>.
        /// </param>
        /// <param name="resultPublisher">Publisher that receives the new result and is asked to publish it.</param>
        /// <returns>The result stored on <paramref name="resultPublisher"/>, cast to <c>RegExpParserResult</c>.</returns>
        /// <exception cref="NotSupportedException">
        /// The rule's type is not <c>RegExpRuleType.RegExpFind</c>, or the rule is neither of the
        /// two supported rule classes.
        /// </exception>
        /// <exception cref="ArgumentNullException">The entries of <paramref name="prevParserResult"/> are null.</exception>
        public static RegExpParserResult FindByRegExp(this RegExpParserResult prevParserResult,
                                                      RegExpRule rule, IRegExpParserResultPublisher <RegExpRuleType, RegExpParserResultEventArgs> resultPublisher)
        {
            //check if regular expression type matches operation type
            if (rule.RuleType != RegExpRuleType.RegExpFind)
            {
                throw new NotSupportedException(
                          String.Format("Method accepts regular expressions that are aimed to find text, but this has type {0}",
                                        rule.RuleType.ToString()));
            }

            //check if entries are not null as next calculations are based on it
            if (prevParserResult.Entries == null)
            {
                // The single-string constructor treats its argument as the parameter name, so
                // the two-argument overload is used to make the text the exception message.
                throw new ArgumentNullException("prevParserResult",
                                                "Something went wrong. Entries of previous action are null.");
            }

            //apply different search logic depending on type of rule
            IEnumerable <KeyValuePair <int, string> > entryIndexDictionary;

            var singleRule    = rule as SingleRegExpRule;
            var multipleRules = rule as MultipleRegExpRules;

            if (singleRule != null)
            {
                entryIndexDictionary = FindEntriesWithSingleRexExp(prevParserResult.Entries, singleRule);
            }
            else if (multipleRules != null)
            {
                entryIndexDictionary = FindEntriesWithMultipleRegExp(prevParserResult.Entries, multipleRules);
            }
            else
            {
                throw new NotSupportedException("Passed rule type is not supported in this method.");
            }

            // Entries are known to be non-null here; the joined entries are used as the new
            // source text unless there are none, in which case the previous source text is kept.
            var resultText = prevParserResult.Entries.Any()
                ? String.Join("\n", prevParserResult.Entries.Select(e => e.Value))
                : prevParserResult.SourceText;

            resultPublisher.RegExpParserResult = new RegExpParserResult(
                resultText, entryIndexDictionary, rule.Title, prevParserResult);
            resultPublisher.Publish();
            return((RegExpParserResult)resultPublisher.RegExpParserResult);
        }