/// <summary>
/// Console entry point: reads "Data.txt", splits its text with the sentence rule,
/// then filters the result first with the question rule and then with the date rule.
/// Every stage is reported through a console visualizer.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string data = File.ReadAllText("Data.txt");

    // --- rules -------------------------------------------------------------
    // Split on whitespace that follows '.', '!' or '?' and precedes an upper-case letter.
    var splitSentences = new SingleRegExpRule(
        @"(?<=[.!?])\s+(?=[A-Z])",
        RegExpRuleType.RegExpSplit,
        "Split sentences");

    // Entries that end with a question mark.
    var filterQuestions = new SingleRegExpRule(
        @"[?]$",
        RegExpRuleType.RegExpFilter,
        "Filter quesions");

    // Two date notations (numeric with '/' or '-', and month-name based), combined as a union.
    var datePatterns = new List<string>
    {
        @"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}",
        @"(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}"
    };
    var filterSentencesWithDate = new MultipleRegExpRules(
        datePatterns,
        RegExpRuleType.RegExpFilter,
        RulesConnectionType.Union,
        "Filter dates");

    // --- result logging ----------------------------------------------------
    var consoleVisualizer = new RegExpResultConsoleVisualizer();
    EventHandler<RegExpParserResultEventArgs> textParsed = consoleVisualizer.Visualize;
    var textParsedPublisher = new RegExpParserResultPublisher(textParsed);

    // --- pipeline ----------------------------------------------------------
    var initData = new RegExpParserResult(data, null);
    var sentences = initData.SplitByRegExp(splitSentences, textParsedPublisher);
    var questions = sentences.FilterByRegExp(filterQuestions, textParsedPublisher);
    var result = questions.FilterByRegExp(filterSentencesWithDate, textParsedPublisher);

    //Console.WriteLine("INITIAL TEXT:\n{0}", data);
    //var freq = SimpleRexExpForTest.CalcFrequencyOfWordI(data);
    //var sentences = SimpleRexExpForTest.SplitOnSentences(data, true);
    //var questions = SimpleRexExpForTest.FilterQuestions(sentences, true);
    //SimpleRexExpForTest.FilterWithDates(questions, true);
}
/// <summary>
/// Splits the source text of <paramref name="prevParserResult"/> with the regular expression
/// of <paramref name="rule"/>, records each fragment together with its position in the source
/// text, stores the new result on <paramref name="resultPublisher"/> and publishes it.
/// </summary>
/// <param name="prevParserResult">Result of the previous step; its <c>SourceText</c> is the text that gets split.</param>
/// <param name="rule">Rule whose <c>RuleType</c> must be <c>RegExpRuleType.RegExpSplit</c>.</param>
/// <param name="resultPublisher">Publisher that receives the new result and is asked to publish it.</param>
/// <returns>The result that was assigned to the publisher, cast to <c>RegExpParserResult</c>.</returns>
/// <exception cref="NotSupportedException">The rule is not a split rule.</exception>
public static RegExpParserResult SplitByRegExp(this RegExpParserResult prevParserResult, SingleRegExpRule rule, IRegExpParserResultPublisher<RegExpRuleType, RegExpParserResultEventArgs> resultPublisher)
{
    //check if regular expression type matches operation type
    if (rule.RuleType != RegExpRuleType.RegExpSplit)
    {
        throw new NotSupportedException(
            String.Format("Method accepts regular expressions that are aimed to split text, but this has type {0}",
                rule.RuleType.ToString()));
    }

    //split blank text
    string sourceText = prevParserResult.SourceText;
    var entryArray = Regex.Split(sourceText, rule.RegularExpression);

    //identify entries indexes
    var entryIndexDictionary = new List<KeyValuePair<int, string>>(entryArray.Length);

    // Fragments normally come back in source order, so each lookup starts where the previous
    // fragment ended. Searching from position 0 every time would give a repeated fragment
    // (e.g. "Yes. Yes.") the index of its first occurrence.
    int searchStart = 0;
    foreach (var entry in entryArray)
    {
        // Ordinal comparison: the fragment is an exact substring, no culture rules wanted.
        int entryIndex = sourceText.IndexOf(entry, searchStart, StringComparison.Ordinal);
        if (entryIndex >= 0)
        {
            searchStart = entryIndex + entry.Length;
        }
        else
        {
            // Fragment not found after the cursor (possible when the pattern captures text
            // that lies before the match); fall back to a whole-text search.
            entryIndex = sourceText.IndexOf(entry, StringComparison.Ordinal);
        }

        entryIndexDictionary.Add(new KeyValuePair<int, string>(entryIndex, entry));
    }

    // Text stored on the new result: the previous entries joined by new lines when there are
    // any, otherwise the previous source text.
    bool hasPreviousEntries = prevParserResult.Entries != null && prevParserResult.Entries.Any();
    string resultSourceText = hasPreviousEntries
        ? String.Join("\n", prevParserResult.Entries.Select(e => e.Value))
        : prevParserResult.SourceText;

    resultPublisher.RegExpParserResult = new RegExpParserResult(
        resultSourceText,
        entryIndexDictionary,
        rule.Title,
        prevParserResult);
    resultPublisher.Publish();

    return (RegExpParserResult)resultPublisher.RegExpParserResult;
}
/// <summary>
/// Splitting the test data with the sentence rule is expected to produce 9 entries.
/// </summary>
public void SplitByRegExp_WithSingleRegExpRule_Split9Sentences()
{
    //Arrange
    var splitSentences = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");
    var initData = new RegExpParserResult(_data, null);

    //Act
    var sentences = initData.SplitByRegExp(splitSentences);

    //Assert
    // AreEqual reports the actual count when the expectation is not met.
    Assert.AreEqual(9, sentences.Entries.Count());
}
/// <summary>
/// Splitting the test data with the sentence rule and then filtering with the question rule
/// is expected to leave 3 entries.
/// </summary>
public void ChainOfSplitByRegExpAndFilterByRegExp_WithSingleRegExpRule_Split9SentencesAdnFiltersQuestions()
{
    //Arrange
    var splitSentences = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");
    var filterQuestions = new SingleRegExpRule(@"[?]$", RegExpRuleType.RegExpFilter, "Filter quesions");
    var initData = new RegExpParserResult(_data, null);

    //Act
    var questions = initData.SplitByRegExp(splitSentences).FilterByRegExp(filterQuestions);

    //Assert
    // AreEqual reports the actual count when the expectation is not met.
    Assert.AreEqual(3, questions.Entries.Count());
}
/// <summary>
/// Splitting the test data with the sentence rule and then filtering with the union of the
/// two date patterns is expected to leave 6 entries.
/// </summary>
public void ChainOfSplitByRegExpAndFilterByRegExp_WithMultipleRegExpRule_Split9SentencesAdnFiltersSentencesWithDates()
{
    //Arrange
    var splitSentences = new SingleRegExpRule(@"(?<=[.!?])\s+(?=[A-Z])", RegExpRuleType.RegExpSplit, "Split sentences");
    // The rule is passed to FilterByRegExp below, so it is declared as a filter rule
    // (the original declared it as RegExpSplit).
    var filterSentencesWithDate = new MultipleRegExpRules(
        new List<string>
        {
            @"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}",
            @"(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}"
        },
        RegExpRuleType.RegExpFilter,
        RulesConnectionType.Union,
        "Filter dates");
    var initData = new RegExpParserResult(_data, null);

    //Act
    var sentencesWithBothFormsOfDates = initData.SplitByRegExp(splitSentences).FilterByRegExp(filterSentencesWithDate);

    //Assert
    // AreEqual reports the actual count when the expectation is not met.
    Assert.AreEqual(6, sentencesWithBothFormsOfDates.Entries.Count());
}
/// <summary>
/// Applies a find rule to the entries of <paramref name="prevParserResult"/>, stores the new
/// result on <paramref name="resultPublisher"/> and publishes it.
/// </summary>
/// <param name="prevParserResult">Result of the previous step; its <c>Entries</c> must not be null.</param>
/// <param name="rule">
/// A <c>SingleRegExpRule</c> or <c>MultipleRegExpRules</c> whose <c>RuleType</c> is
/// <c>RegExpRuleType.RegExpFind</c>.
/// </param>
/// <param name="resultPublisher">Publisher that receives the new result and is asked to publish it.</param>
/// <returns>The result that was assigned to the publisher, cast to <c>RegExpParserResult</c>.</returns>
/// <exception cref="NotSupportedException">
/// The rule is not a find rule, or it is neither a single nor a multiple rule.
/// </exception>
/// <exception cref="ArgumentNullException">The entries of <paramref name="prevParserResult"/> are null.</exception>
public static RegExpParserResult FindByRegExp(this RegExpParserResult prevParserResult, RegExpRule rule, IRegExpParserResultPublisher<RegExpRuleType, RegExpParserResultEventArgs> resultPublisher)
{
    //check if regular expression type matches operation type
    if (rule.RuleType != RegExpRuleType.RegExpFind)
    {
        throw new NotSupportedException(
            String.Format("Method accepts regular expressions that are aimed to find text, but this has type {0}",
                rule.RuleType.ToString()));
    }

    //check if entries are not null as next calculations are based on it
    if (prevParserResult.Entries == null)
    {
        // Two-argument overload: the single-string constructor would treat the text as the
        // parameter name instead of the message.
        throw new ArgumentNullException(
            "prevParserResult",
            "Something went wrong. Entries of previous action are null.");
    }

    //apply different search logic depending on type of rule
    IEnumerable<KeyValuePair<int, string>> entryIndexDictionary;
    var singleRule = rule as SingleRegExpRule;
    var multipleRules = rule as MultipleRegExpRules;
    if (singleRule != null)
    {
        entryIndexDictionary = FindEntriesWithSingleRexExp(prevParserResult.Entries, singleRule);
    }
    else if (multipleRules != null)
    {
        entryIndexDictionary = FindEntriesWithMultipleRegExp(prevParserResult.Entries, multipleRules);
    }
    else
    {
        throw new NotSupportedException("Passed rule type is not supported in this method.");
    }

    // Entries are known to be non-null here. Text stored on the new result: the previous
    // entries joined by new lines when there are any, otherwise the previous source text.
    string resultSourceText = prevParserResult.Entries.Any()
        ? String.Join("\n", prevParserResult.Entries.Select(e => e.Value))
        : prevParserResult.SourceText;

    resultPublisher.RegExpParserResult = new RegExpParserResult(
        resultSourceText,
        entryIndexDictionary,
        rule.Title,
        prevParserResult);
    resultPublisher.Publish();

    return (RegExpParserResult)resultPublisher.RegExpParserResult;
}