/// <summary>
/// Parses <paramref name="text"/> within <paramref name="span"/>, pulling further
/// input from the stream for as long as <c>NeedsMoreInput</c> reports that the
/// parser could not decide with the text available.
/// </summary>
/// <param name="text">The initial stream text.</param>
/// <param name="span">The span of <paramref name="text"/> to parse.</param>
/// <param name="parser">The parser to apply.</param>
/// <returns>
/// A <c>StreamTextCursor</c> over the text that produced a result, or an
/// <c>EmptyTextCursor</c> over the original text and span when nothing matched.
/// </returns>
public static async Task<TextCursor> ParseText(StreamText text, TextSpan span, ITextParser parser)
{
    var result = parser.Parse(text, span);

    // Fast path: the text already in hand was enough to decide.
    if (!NeedsMoreInput(text, span, result))
    {
        if (result.HasResult)
        {
            return new StreamTextCursor(text, result.Result, result.Next, parser);
        }

        return new EmptyTextCursor(text, span);
    }

    // Slow path: fetch the next text and re-parse it from its start until the
    // parser no longer asks for more input.
    var current = await text.GetNext(span).ConfigureAwait(false);
    while (true)
    {
        var window = new TextSpan(0, current.Length);
        result = parser.Parse(current, window);

        if (!NeedsMoreInput(current, window, result))
        {
            break;
        }

        current = await current.Reader.Next(current).ConfigureAwait(false);
    }

    if (result.HasResult)
    {
        return new StreamTextCursor(current, result.Result, result.Next, parser);
    }

    // No match: the empty cursor refers to the original text and span.
    return new EmptyTextCursor(text, span);
}
/// <summary>
/// Applies the inner parser repeatedly and merges the successive matches into
/// a single span.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Unmatched</c> when at least one match is required and the first attempt
/// fails; otherwise <c>Success</c> carrying the merged span and the position
/// following the last accepted match.
/// </returns>
public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
{
    var attempt = _parser.Parse(text, span);
    if (_atLeastOne && !attempt.HasResult)
    {
        return new Unmatched<TextSpan, TextSpan>(attempt.Next);
    }

    var cursor = span;

    // The accumulator starts from the Head of the first attempt's result.
    var combined = attempt.Result.Head;

    // Keep going while the parser matches, makes progress, and the new match
    // is adjacent to what has been accumulated so far.
    while (attempt.HasResult && !(cursor == attempt.Next) && combined.IsAdjacentTo(attempt.Result))
    {
        combined += attempt.Result;
        cursor = attempt.Next;
        attempt = _parser.Parse(text, cursor);
    }

    return new Success<TextSpan, TextSpan>(combined, cursor);
}
/// <summary>
/// Applies the inner parser one or more times and collects every matched span.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Unmatched</c> when the first attempt fails; otherwise <c>Success</c> with
/// the collected spans and the position following the last accepted match.
/// </returns>
public Result<TextSpan, IReadOnlyList<TextSpan>> Parse(ParseText text, TextSpan span)
{
    var attempt = _parser.Parse(text, span);
    if (!attempt.HasResult)
    {
        return new Unmatched<TextSpan, IReadOnlyList<TextSpan>>(attempt.Next);
    }

    var position = span;
    var collected = new List<TextSpan>();

    // The first attempt is known to have matched, so a do/while is sufficient.
    do
    {
        // A match that did not advance the position would repeat forever.
        if (position == attempt.Next)
        {
            break;
        }

        collected.Add(attempt.Result);
        position = attempt.Next;
        attempt = _parser.Parse(text, position);
    }
    while (attempt.HasResult);

    return new Success<TextSpan, IReadOnlyList<TextSpan>>(collected, position);
}
/// <summary>
/// Parses further slices on demand until the slice at <paramref name="index"/>
/// is available or the remaining span is exhausted.
/// </summary>
/// <param name="index">Zero-based index of the requested slice.</param>
/// <param name="result">The slice at <paramref name="index"/>, or the default value when unavailable.</param>
/// <returns><c>true</c> when the slice exists; otherwise <c>false</c>.</returns>
bool TryParseUntil(int index, out TSlice result)
{
    while (_parseSpan.Length > 0 && index >= _slices.Count)
    {
        var parsed = _parser.Parse(_sourceText, _parseSpan);
        if (!parsed.HasResult)
        {
            break;
        }

        AddSlice(parsed.Result);
        _parseSpan = parsed.Next;

        // If nothing remains and the remaining span starts past the end of the
        // match, something (e.g. a separator) sat in between, so an empty
        // trailing value follows and is added as well.
        bool trailingEmptyValue = _parseSpan.IsEmpty && _parseSpan.Start > parsed.Result.End;
        if (trailingEmptyValue)
        {
            AddSlice(_parseSpan);
        }
    }

    if (index >= _slices.Count)
    {
        result = default;
        return false;
    }

    result = _slices[index];
    return true;
}
/// <summary>
/// Extracts text from a downloaded response by writing the response bytes to a
/// temporary file (named with an extension derived from the content type) and
/// running the text parser over that file.
/// </summary>
/// <param name="crawler">The crawler invoking this step (not used here).</param>
/// <param name="propertyBag">Carries the response; receives <c>Title</c> and <c>Text</c>.</param>
/// <returns>Always <c>true</c>.</returns>
public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    const int BufferSize = 0x1000;

    bool hasBody = propertyBag.StatusCode == HttpStatusCode.OK && propertyBag.Response != null;
    if (!hasBody)
    {
        return true;
    }

    string extension = MapContentTypeToExtension(propertyBag.ContentType);
    if (extension.IsNullOrEmpty())
    {
        // No extension mapped for this content type: nothing to extract.
        return true;
    }

    propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

    using (TempFile scratch = new TempFile())
    {
        // The extension is appended to the temp file's name before it is handed to the parser.
        scratch.FileName = scratch.FileName + "." + extension;

        using (FileStream output = new FileStream(scratch.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, BufferSize))
        {
            await output.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
        }

        ITextParser textParser = ParserFactory.CreateText(new ParserContext(scratch.FileName));
        propertyBag.Text = textParser.Parse();
    }

    return true;
}
/// <summary>
/// Parses one element optionally followed by a separator.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Success</c> with the element's span, continuing after the separator when
/// one was matched and after the element otherwise; <c>Unmatched</c> when the
/// span is empty, the element does not match, or the element matched without
/// a separator and without consuming any input.
/// </returns>
public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
{
    if (span.IsEmpty)
    {
        return new Unmatched<TextSpan, TextSpan>(span);
    }

    var element = _element.Parse(text, span);
    if (!element.HasResult)
    {
        return new Unmatched<TextSpan, TextSpan>(element.Next);
    }

    var separator = _separator.Parse(text, element.Next);
    if (separator.HasResult)
    {
        return new Success<TextSpan, TextSpan>(element.Result, separator.Next);
    }

    // No separator and no progress: report no match rather than a zero-advance success.
    if (element.Next == span)
    {
        return new Unmatched<TextSpan, TextSpan>(span);
    }

    return new Success<TextSpan, TextSpan>(element.Result, element.Next);
}
// Parses a document and returns its words.
//   path   - path of the file to parse.
//   option - "document" (e.g. .docx, .pdf) uses the document parser;
//            "txt" (.txt) uses the text parser.
// The extracted text is lower-cased, line breaks and tabs become spaces,
// punctuation is removed, and the result is split on whitespace.
// Returns an empty array when parsing fails or the option is not recognised
// (previously a null text was passed on to RemovePunctuation/Split).
public string[] GetText(string path, string option)
{
    string text = null;
    try
    {
        ParserContext context = new ParserContext(path);
        string raw = null;

        if (option.Equals("txt"))
        {
            ITextParser parser = ParserFactory.CreateText(context);
            raw = parser.Parse().ToString();
        }
        else if (option.Equals("document"))
        {
            IDocumentParser parser = ParserFactory.CreateDocument(context);
            raw = parser.Parse().ToString();
        }

        if (raw != null)
        {
            text = raw.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
        }
    }
    catch (Exception e)
    {
        // Best-effort: report the failure and fall through to the empty result.
        Console.WriteLine("Exception found");
        Console.WriteLine(e.Message);
    }

    // Nothing was extracted: there are no words to return.
    if (text == null)
    {
        return new string[0];
    }

    text = RemovePunctuation(text);

    // A null separator array makes Split break on white-space characters.
    string[] words = text.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
    return words;
}
//---------------------------------------------------------------------

/// <summary>
/// Loads an instance of T from a file. A file whose extension equals
/// <c>FileExtension</c> is deserialized as an editable instance;
/// any other file is opened as text and handed to the parser.
/// </summary>
/// <param name="path">Path of the file to load.</param>
/// <param name="parser">Parser used for text files.</param>
/// <returns>The loaded instance.</returns>
/// <exception cref="System.ApplicationException">
/// The deserialized editable instance is not complete.
/// </exception>
public static T Load<T>(string path, ITextParser<T> parser)
{
    if (Path.GetExtension(path) != FileExtension)
    {
        // Plain text file: parse it, always closing the reader afterwards.
        LineReader reader = OpenTextFile(path);
        try
        {
            return parser.Parse(reader);
        }
        finally
        {
            reader.Close();
        }
    }

    // Serialized editable instance (binary serialization).
    // NOTE(review): BinaryFormatter is unsafe on untrusted input and is
    // obsolete in current .NET; only load files from trusted sources.
    IFormatter formatter = new BinaryFormatter();
    using (Stream stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        IEditable<T> editable = (IEditable<T>)formatter.Deserialize(stream);
        if (editable.IsComplete)
        {
            return editable.GetComplete();
        }

        throw new System.ApplicationException("Not complete T");
    }
}
/// <summary>
/// Runs the text parser created for the given file and returns what it produces.
/// </summary>
/// <param name="fileInfo">The file to parse; its full path is passed to the parser context.</param>
/// <returns>The string returned by the text parser.</returns>
public string ParseFileToString(FileInfo fileInfo)
{
    var context = new ParserContext(fileInfo.FullName);
    return ParserFactory.CreateText(context).Parse();
}
/// <summary>
/// Parses <paramref name="text"/> and, when the parser yields a non-null result,
/// forwards that result to the overload that compiles it.
/// </summary>
/// <param name="text">The text to parse and compile.</param>
/// <returns>
/// <c>false</c> when parsing yields null; otherwise the outcome of compiling
/// the parsed text.
/// </returns>
public bool TryCompile(string text)
{
    var parsed = _parser.Parse(text);
    return parsed != null && TryCompile(parsed);
}
/// <summary>
/// Parsing "toxy.zip" with the text parser yields 68 non-empty lines.
/// </summary>
public void TestParseDirectoryFromZip()
{
    var archivePath = TestDataSample.GetFilePath("toxy.zip", null);
    ITextParser parser = ParserFactory.CreateText(new ParserContext(archivePath));

    string listing = parser.Parse();

    Assert.IsNotNull(listing);
    var lineBreaks = new string[] { Environment.NewLine };
    string[] entries = listing.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);
    Assert.AreEqual(68, entries.Length);
}
/// <summary>
/// The text extracted from "Employee.xls" contains both column captions,
/// each at an index greater than zero.
/// </summary>
public void TestExcel2003TextParser()
{
    var workbookPath = TestDataSample.GetExcelPath("Employee.xls");
    ITextParser parser = ParserFactory.CreateText(new ParserContext(workbookPath));

    string extracted = parser.Parse();

    Assert.IsNotNull(extracted);
    int lastNameAt = extracted.IndexOf("Last name");
    Assert.IsTrue(lastNameAt > 0);
    int firstNameAt = extracted.IndexOf("First name");
    Assert.IsTrue(firstNameAt > 0);
}
/// <summary>
/// Reading "utf8.txt" returns its four lines, each terminated by the
/// environment's newline.
/// </summary>
public void TestReadWholeText()
{
    // The trailing empty entry produces the newline after the last line.
    string expected = string.Join(
        Environment.NewLine,
        new string[] { "hello world", "a2", "a3", "bbb4", "" });

    string path = TestDataSample.GetTextPath("utf8.txt");
    ITextParser parser = ParserFactory.CreateText(new ParserContext(path));
    string actual = parser.Parse();

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Tries the first parser and falls back to the second, from the same span,
/// when the first does not match.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>The first parser's result when it matched; otherwise the second parser's result.</returns>
public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
{
    var primary = _first.Parse(text, span);
    if (!primary.HasResult)
    {
        return _second.Parse(text, span);
    }

    return primary;
}
/// <summary>
/// With the "IncludeHeaderFooter" property set to "1", the text extracted from
/// "WithVariousData.xlsx" contains the header text.
/// </summary>
public void TestExcel2007TextParserWithHeaderFooter()
{
    var workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ParserContext context = new ParserContext(workbookPath);
    context.Properties.Add("IncludeHeaderFooter", "1");

    ITextParser parser = ParserFactory.CreateText(context);
    string extracted = parser.Parse();

    Assert.IsNotNull(extracted);
    int headerAt = extracted.IndexOf("This is the header");
    Assert.IsTrue(headerAt > 0);
}
/// <summary>
/// With the "IncludeSheetNames" property set to "0", the text extracted from
/// "WithVariousData.xlsx" does not contain "Sheet1".
/// </summary>
public void TestExcel2007TextParserWithoutSheetNames()
{
    var workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ParserContext context = new ParserContext(workbookPath);
    context.Properties.Add("IncludeSheetNames", "0");

    ITextParser parser = ParserFactory.CreateText(context);
    string extracted = parser.Parse();

    Assert.IsNotNull(extracted);
    int sheetNameAt = extracted.IndexOf("Sheet1");
    Assert.IsTrue(sheetNameAt < 0);
}
/// <summary>
/// Applies the inner parser only where the exclusion parser does not match.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Unmatched</c> at <paramref name="span"/> when the exclusion parser
/// matches; otherwise the inner parser's result.
/// </returns>
public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
{
    var excluded = _except.Parse(text, span);
    if (!excluded.HasResult)
    {
        return _parser.Parse(text, span);
    }

    return new Unmatched<TextSpan, TextSpan>(span);
}
/// <summary>
/// Applies the inner parser and substitutes the default value when it does not match.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// The inner parser's result when it matched; otherwise <c>Success</c> with the
/// default value, continuing at the unchanged <paramref name="span"/>.
/// </returns>
public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
{
    var attempt = _parser.Parse(text, span);
    if (!attempt.HasResult)
    {
        return new Success<TextSpan, TextSpan>(_defaultValue, span);
    }

    return attempt;
}
/// <summary>
/// Parses in-memory text synchronously and returns the outcome as an already
/// completed task.
/// </summary>
/// <param name="text">The text to parse.</param>
/// <param name="span">The span of <paramref name="text"/> to parse.</param>
/// <param name="parser">The parser to apply.</param>
/// <returns>
/// A completed task holding a <c>StreamTextCursor</c> when the parser matched,
/// or an <c>EmptyTextCursor</c> otherwise. Either cursor wraps the text in a
/// <c>StreamText</c> constructed with a null second argument.
/// </returns>
public static Task<TextCursor> ParseText(ParseText text, TextSpan span, ITextParser parser)
{
    var result = parser.Parse(text, span);
    var wrapped = new StreamText(text, null);

    TextCursor cursor;
    if (result.HasResult)
    {
        cursor = new StreamTextCursor(wrapped, result.Result, result.Next, parser);
    }
    else
    {
        cursor = new EmptyTextCursor(wrapped, span);
    }

    return Task.FromResult<TextCursor>(cursor);
}
/// <summary>
/// Loads an instance of T by opening the file at <paramref name="path"/> as
/// text and handing the reader to <paramref name="parser"/>.
/// </summary>
/// <param name="path">Path of the text file to load.</param>
/// <param name="parser">Parser that produces the instance from the reader.</param>
/// <returns>The parsed instance.</returns>
private T Load<T>(string path, ITextParser<T> parser)
{
    LineReader input = this.OpenTextFile(path);
    T loaded;
    try
    {
        loaded = parser.Parse(input);
    }
    finally
    {
        // The reader is closed whether or not parsing succeeded.
        input.Close();
    }

    return loaded;
}
/// <summary>
/// Applies the inner parser and converts the matched span to a value of type T.
/// </summary>
/// <param name="text">The text being parsed.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Success</c> with the converted value when the inner parser matched and
/// the converter accepted the span; otherwise <c>Unmatched</c> at the inner
/// parser's next position.
/// </returns>
public Result<TextSpan, T> Parse(ParseText text, TextSpan span)
{
    var parsed = _parser.Parse(text, span);

    // The converter is only consulted when the inner parser matched.
    if (parsed.HasResult && _converter(text, parsed.Result, out T converted))
    {
        return new Success<TextSpan, T>(converted, parsed.Next);
    }

    return new Unmatched<TextSpan, T>(parsed.Next);
}
/// <summary>
/// Parses <paramref name="text"/> into sentences and returns them rendered by
/// the formatter selected through <paramref name="format"/>.
/// </summary>
/// <param name="format">Identifies the output formatter to create.</param>
/// <param name="text">The text to parse.</param>
/// <returns>
/// An OK result carrying the formatted sentences, or a No Content result when
/// parsing or formatting throws.
/// </returns>
public IActionResult Get(string format, string text)
{
    try
    {
        var sentences = _parser.Parse(text);
        var formatter = MakeTextFormatter(format);
        return Ok(formatter.Format(sentences));
    }
    catch (Exception e)
    {
        // The exception must be the first argument: passed after the message it
        // is treated as a format argument and its details are never logged.
        _logger.LogError(e, "Text processing has failed.");
        return NoContent();
    }
}
/// <summary>
/// With default settings the text extracted from "WithVariousData.xlsx"
/// contains the expected cell texts and does not contain the header text.
/// </summary>
public void TestExcel2007TextParser()
{
    var workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ITextParser parser = ParserFactory.CreateText(new ParserContext(workbookPath));

    string extracted = parser.Parse();

    Assert.IsNotNull(extracted);

    // Texts that must be present, at an index greater than zero.
    Assert.IsTrue(extracted.IndexOf("Foo") > 0);
    Assert.IsTrue(extracted.IndexOf("Bar") > 0);
    Assert.IsTrue(extracted.IndexOf("a really long cell") > 0);
    Assert.IsTrue(extracted.IndexOf("have a header") > 0);
    Assert.IsTrue(extracted.IndexOf("have a footer") > 0);

    // The header text must be absent by default.
    int headerAt = extracted.IndexOf("This is the header");
    Assert.IsTrue(headerAt < 0);
}
/// <summary>
/// The text extracted from "testPPT.pptx" splits on CRLF into 14 non-empty
/// entries with known values at selected positions.
/// </summary>
public void ReadTextBasicTest()
{
    string presentationPath = Path.GetFullPath(TestDataSample.GetPowerpointPath("testPPT.pptx"));
    ITextParser parser = ParserFactory.CreateText(new ParserContext(presentationPath));

    string extracted = parser.Parse();
    Assert.IsNotNullOrEmpty(extracted);

    var crlf = new string[] { "\r\n" };
    string[] entries = extracted.Split(crlf, StringSplitOptions.RemoveEmptyEntries);

    Assert.AreEqual(14, entries.Length);
    Assert.AreEqual("Attachment Test", entries[0]);
    Assert.AreEqual("Rajiv", entries[1]);
    Assert.AreEqual("Different words to test against", entries[4]);
    Assert.AreEqual("Hello", entries[7]);
}
/// <summary>
/// Any character except 'b', repeated zero or more times, matches only the
/// leading "a" of "abcd" when the parser is declared with a LINQ query.
/// </summary>
public void Should_match_up_to_the_except_match_for_one_using_LINQ()
{
    string subject = "abcd";

    ITextParser charParser = Parser.Factory.CreateText(x =>
        from prefix in x.Char.Except(x.Char('b')).ZeroOrMore()
        select prefix
    );

    Result<TextSpan, TextSpan> result = charParser.Parse(subject);

    Assert.IsTrue(result.HasResult);

    var matchedSpan = result.Result;
    Assert.That(matchedSpan.Length, Is.EqualTo(1));

    string matchedText = subject.Substring(result.Result.Start, result.Result.Length);
    Assert.That(matchedText, Is.EqualTo("a"));
}
/// <summary>
/// Extracts the text of a file and rejects it when it contains a control
/// character other than line feed, tab or carriage return.
/// </summary>
/// <param name="filePath">Path of the file to parse.</param>
/// <param name="extension">Not used by this method.</param>
/// <returns>
/// The extracted text, or <c>null</c> (after reporting the offending character
/// on standard error) when a disallowed control character is found.
/// </returns>
public string ExtractText(string filePath, string extension)
{
    ITextParser parser = ParserFactory.CreateText(new ParserContext(filePath));
    string text = parser.Parse();

    for (int i = 0; i < text.Length; i++)
    {
        char current = text[i];
        bool isPermittedWhitespace = current == '\n' || current == '\t' || current == '\r';
        if (!char.IsControl(current) || isPermittedWhitespace)
        {
            continue;
        }

        Console.Error.WriteLine("Found control character: {0} {1}", (int)current, current);
        return null;
    }

    return text;
}
/// <summary>
/// The text extracted from "SampleDoc.docx" consists of six known non-empty
/// lines separated by the environment's newline.
/// </summary>
public void TestParseTextFromWord()
{
    var documentPath = TestDataSample.GetWordPath("SampleDoc.docx");
    ITextParser parser = ParserFactory.CreateText(new ParserContext(documentPath));

    string doc = parser.Parse();
    Assert.IsNotNull(doc);

    string[] lines = doc.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
    Assert.AreEqual(6, lines.Length);

    string[] expectedLines =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue",
    };

    for (int i = 0; i < expectedLines.Length; i++)
    {
        Assert.AreEqual(expectedLines[i], lines[i]);
    }
}
/// <summary>
/// Runs the tag-cloud pipeline: parses the command-line options, reads and
/// parses the input file, builds the cloud, renders it and saves the image.
/// </summary>
/// <remarks>
/// Stops early when the command-line arguments cannot be parsed; previously
/// the parse result was ignored and the pipeline ran on unpopulated options.
/// </remarks>
public void Run()
{
    var options = new Options();
    if (!CommandLine.Parser.Default.ParseArguments(Args, options))
    {
        Console.Error.WriteLine("Could not parse command line arguments.");
        return;
    }

    var text = reader.Read(options.InputFile);
    var words = parser.Parse(text);

    var useFilters = GetFiltersNames();
    var useConverters = GetConvertorsNames();
    cloudCreator.Create(words, options.MaxFontSize, options.MinFontSize, options.WordsCount, options.Font,
                        useFilters, useConverters);

    var bitmap = visualizer.Vizualize(cloudCreator.RectanglesCloud, Color.AliceBlue);
    saver.Save(bitmap, options.OutputFile);

    Console.WriteLine($@"Image saved to {options.OutputFile}");

    // Wait for a key press before returning.
    Console.ReadKey();
}
/// <summary>
/// Matches the text against the character pattern held in <c>_chars</c>, using
/// the inner parser to obtain successive spans starting at <paramref name="span"/>.
/// </summary>
/// <param name="text">The text being parsed; indexed per character for comparison.</param>
/// <param name="span">The span at which parsing starts.</param>
/// <returns>
/// <c>Success</c> with the accumulated span when its length equals the pattern
/// length; <c>Unmatched</c> from the mismatching index to the end of the current
/// result on the first differing character; otherwise <c>Unmatched</c> at the
/// last position reached.
/// </returns>
/// <remarks>
/// The outer loop stops when the inner parser matches without advancing. The
/// inner loop walks the characters of each result, stopping once the
/// accumulated span is no longer adjacent to that result.
/// NOTE(review): <c>patternIndex</c> is advanced by both the inner and the outer
/// loop, so it moves one extra step after each inner pass - confirm this is intended.
/// NOTE(review): <c>matched += result</c> adds the whole result once per compared
/// character rather than once per result - confirm against the semantics of
/// <c>IsAdjacentTo</c> and <c>+</c>.
/// NOTE(review): <c>_chars[patternIndex]</c> is not bounds-checked inside the inner
/// loop; a result longer than the remaining pattern looks able to index past
/// the end - verify.
/// NOTE(review): when the inner parser does not match, <c>next</c> is unchanged and
/// the same position is parsed again on every remaining outer iteration.
/// </remarks>
public Result <TextSpan, TextSpan> Parse(ParseText text, TextSpan span) { var next = span; var matched = next.Head; int patternIndex; for (patternIndex = 0; patternIndex < _chars.Length; patternIndex++) { var parsed = _parser.Parse(text, next); if (parsed.HasResult) { if (next == parsed.Next) { break; } var result = parsed.Result; for (int sourceIndex = result.Start; sourceIndex < parsed.Result.End; sourceIndex++, patternIndex++) { if (!matched.IsAdjacentTo(result)) { break; } if (_chars[patternIndex] != text[sourceIndex]) { return(new Unmatched <TextSpan, TextSpan>(TextSpan.FromBounds(sourceIndex, result.End))); } matched += result; } next = parsed.Next; } } if (matched.Length == _chars.Length) { return(new Success <TextSpan, TextSpan>(matched, next)); } return(new Unmatched <TextSpan, TextSpan>(next)); }
/// <summary>
/// The text extracted from "SampleDoc.doc" splits on carriage returns into
/// eight known entries, including a form feed entry and a trailing empty entry.
/// </summary>
public void TestParseTextFromWord()
{
    var documentPath = TestDataSample.GetWordPath("SampleDoc.doc");
    ITextParser parser = ParserFactory.CreateText(new ParserContext(documentPath));

    string doc = parser.Parse();
    Assert.IsNotNull(doc);

    string[] lines = doc.Split('\r');
    Assert.AreEqual(8, lines.Length);

    string[] expectedLines =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "\f",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue",
        "",
    };

    for (int i = 0; i < expectedLines.Length; i++)
    {
        Assert.AreEqual(expectedLines[i], lines[i]);
    }
}