Пример #1
0
        /// <summary>
        /// Parses <paramref name="text"/> over <paramref name="span"/>; while
        /// <c>NeedsMoreInput</c> reports that the outcome cannot be decided yet, further
        /// stream text is fetched and parsed again from its start.
        /// </summary>
        /// <param name="text">The initial stream text.</param>
        /// <param name="span">The span of <paramref name="text"/> to parse.</param>
        /// <param name="parser">The parser to apply.</param>
        /// <returns>
        /// A <c>StreamTextCursor</c> over the text that produced a result, or an
        /// <c>EmptyTextCursor</c> over the original text and span when there is no result.
        /// </returns>
        public static async Task<TextCursor> ParseText(StreamText text, TextSpan span, ITextParser parser)
        {
            var outcome = parser.Parse(text, span);

            // Fast path: the initial text was enough to decide either way.
            if (!NeedsMoreInput(text, span, outcome))
            {
                if (outcome.HasResult)
                {
                    return new StreamTextCursor(text, outcome.Result, outcome.Next, parser);
                }

                return new EmptyTextCursor(text, span);
            }

            var chunk = await text.GetNext(span).ConfigureAwait(false);
            var window = new TextSpan(0, chunk.Length);
            outcome = parser.Parse(chunk, window);

            // Keep pulling text from the reader until the parser can decide.
            while (NeedsMoreInput(chunk, window, outcome))
            {
                chunk = await chunk.Reader.Next(chunk).ConfigureAwait(false);
                window = new TextSpan(0, chunk.Length);
                outcome = parser.Parse(chunk, window);
            }

            if (outcome.HasResult)
            {
                return new StreamTextCursor(chunk, outcome.Result, outcome.Next, parser);
            }

            // No match even with more input: report emptiness against the caller's text/span.
            return new EmptyTextCursor(text, span);
        }
Пример #2
0
        /// <summary>
        /// Applies the inner parser repeatedly and merges the adjacent results into one span.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Unmatched</c> when at least one match is required and the first attempt has no
        /// result; otherwise <c>Success</c> with the merged span and the position after the
        /// last accepted match.
        /// </returns>
        public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            var attempt = _parser.Parse(text, span);

            if (_atLeastOne && !attempt.HasResult)
            {
                return new Unmatched<TextSpan, TextSpan>(attempt.Next);
            }

            var position = span;
            var combined = attempt.Result.Head;

            // Stop when the parser fails, makes no progress, or yields a span that is
            // not adjacent to what has been accumulated so far.
            while (attempt.HasResult
                   && !(position == attempt.Next)
                   && combined.IsAdjacentTo(attempt.Result))
            {
                combined += attempt.Result;
                position = attempt.Next;
                attempt = _parser.Parse(text, position);
            }

            return new Success<TextSpan, TextSpan>(combined, position);
        }
Пример #3
0
        /// <summary>
        /// Applies the inner parser repeatedly and collects each result span in order.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Unmatched</c> when the first attempt has no result; otherwise <c>Success</c>
        /// with the collected spans and the position after the last accepted match.
        /// </returns>
        public Result<TextSpan, IReadOnlyList<TextSpan>> Parse(ParseText text, TextSpan span)
        {
            var attempt = _parser.Parse(text, span);

            if (!attempt.HasResult)
            {
                return new Unmatched<TextSpan, IReadOnlyList<TextSpan>>(attempt.Next);
            }

            var collected = new List<TextSpan>();
            var position = span;

            // Stop as soon as the parser fails or stops advancing.
            while (attempt.HasResult && !(position == attempt.Next))
            {
                collected.Add(attempt.Result);
                position = attempt.Next;
                attempt = _parser.Parse(text, position);
            }

            return new Success<TextSpan, IReadOnlyList<TextSpan>>(collected, position);
        }
Пример #4
0
        /// <summary>
        /// Parses further slices from the remaining span until the slice at
        /// <paramref name="index"/> exists, the remaining span is exhausted, or the parser
        /// yields no result.
        /// </summary>
        /// <param name="index">Zero-based index of the requested slice.</param>
        /// <param name="result">The slice at <paramref name="index"/>, or the default value when unavailable.</param>
        /// <returns><c>true</c> when the slice exists; otherwise <c>false</c>.</returns>
        bool TryParseUntil(int index, out TSlice result)
        {
            while (_parseSpan.Length > 0 && index >= _slices.Count)
            {
                var parsed = _parser.Parse(_sourceText, _parseSpan);
                if (!parsed.HasResult)
                {
                    break;
                }

                AddSlice(parsed.Result);
                _parseSpan = parsed.Next;

                // An empty remainder that starts past the end of the last result means something
                // (e.g. a separator) was consumed in between, so a trailing empty value follows.
                bool trailingEmptyValue = _parseSpan.IsEmpty && _parseSpan.Start > parsed.Result.End;
                if (trailingEmptyValue)
                {
                    AddSlice(_parseSpan);
                }
            }

            if (index >= _slices.Count)
            {
                result = default;
                return false;
            }

            result = _slices[index];
            return true;
        }
        /// <summary>
        /// Writes the response bytes to a temporary file whose extension is derived from the
        /// content type, runs the text parser on that file, and stores the extracted text in
        /// <paramref name="propertyBag"/>.
        /// </summary>
        /// <param name="crawler">The crawler; not used by this method.</param>
        /// <param name="propertyBag">Carries the response and receives the title and text.</param>
        /// <returns>Always <c>true</c>.</returns>
        public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            bool hasUsableResponse = propertyBag.StatusCode == HttpStatusCode.OK &&
                                     propertyBag.Response != null;
            if (!hasUsableResponse)
            {
                return true;
            }

            string extension = MapContentTypeToExtension(propertyBag.ContentType);
            if (extension.IsNullOrEmpty())
            {
                // No extension mapped for this content type: nothing to extract.
                return true;
            }

            propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

            using (TempFile temp = new TempFile())
            {
                // The parser factory is given only the file name, so the extension is appended to it.
                temp.FileName = temp.FileName + "." + extension;

                using (FileStream output = new FileStream(temp.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                {
                    await output.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
                }

                ParserContext context = new ParserContext(temp.FileName);
                propertyBag.Text = ParserFactory.CreateText(context).Parse();
            }

            return true;
        }
Пример #6
0
        /// <summary>
        /// Parses one element and, when present, the separator that follows it.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Success</c> with the element span, positioned after the separator when one was
        /// matched and after the element otherwise; <c>Unmatched</c> when the span is empty,
        /// the element does not match, or the element matched without advancing and no
        /// separator followed.
        /// </returns>
        public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            if (span.IsEmpty)
            {
                return new Unmatched<TextSpan, TextSpan>(span);
            }

            var element = _element.Parse(text, span);
            if (!element.HasResult)
            {
                return new Unmatched<TextSpan, TextSpan>(element.Next);
            }

            var separator = _separator.Parse(text, element.Next);
            if (separator.HasResult)
            {
                // Skip past the separator so the next parse starts at the following element.
                return new Success<TextSpan, TextSpan>(element.Result, separator.Next);
            }

            // The element consumed nothing and no separator followed: treat as no match.
            if (element.Next == span)
            {
                return new Unmatched<TextSpan, TextSpan>(span);
            }

            return new Success<TextSpan, TextSpan>(element.Result, element.Next);
        }
Пример #7
0
        /// <summary>
        /// Extracts the text of the file at <paramref name="path"/>, lower-cases it, replaces
        /// line breaks and tabs with spaces, removes punctuation and splits it into words.
        /// </summary>
        /// <param name="path">Path of the file to parse.</param>
        /// <param name="option">
        /// <c>"document"</c> to use the document parser (.docx, .pdf) or <c>"txt"</c> to use
        /// the text parser (.txt).
        /// </param>
        /// <returns>
        /// The words of the document; an empty array when no text could be extracted
        /// (parsing failed or <paramref name="option"/> was not recognised).
        /// </returns>
        public string[] GetText(string path, string option)
        {
            string text = null;

            try
            {
                ParserContext context = new ParserContext(path);
                string raw = null;

                if (option.Equals("txt"))
                {
                    ITextParser parser = ParserFactory.CreateText(context);
                    raw = parser.Parse().ToString();
                }
                else if (option.Equals("document"))
                {
                    IDocumentParser parser = ParserFactory.CreateDocument(context);
                    raw = parser.Parse().ToString();
                }

                if (raw != null)
                {
                    text = raw.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
                }
            }
            catch (Exception e)
            {
                // Best effort: report the failure and fall through to the empty result.
                Console.WriteLine("Exception found");
                Console.WriteLine(e.Message);
            }

            // Without this guard a failed parse or unknown option left text null, which was
            // then passed to RemovePunctuation and dereferenced by Split.
            if (text == null)
            {
                return new string[0];
            }

            text = RemovePunctuation(text);

            // A null separator array makes Split break on white space.
            return text.Split(default(char[]), StringSplitOptions.RemoveEmptyEntries);
        }
Пример #8
0
        //---------------------------------------------------------------------

        /// <summary>
        /// Loads an instance of T from a file.  When the file's extension equals
        /// <c>FileExtension</c> the file is deserialized as an editable instance and its
        /// completed value is returned; otherwise the file is opened as text and handed to
        /// <paramref name="parser"/>.
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        /// <param name="parser">Parser used for text files.</param>
        /// <returns>The loaded instance.</returns>
        /// <exception cref="System.ApplicationException">
        /// The deserialized editable instance is not complete.
        /// </exception>
        /// <remarks>
        /// NOTE(review): BinaryFormatter deserialization is unsafe on untrusted input and has
        /// been removed from recent .NET versions; confirm these files only come from trusted
        /// sources.
        /// </remarks>
        public static T Load<T>(string path,
                                ITextParser<T> parser)
        {
            if (Path.GetExtension(path) != FileExtension)
            {
                // Plain text file: parse it, always closing the reader afterwards.
                LineReader reader = OpenTextFile(path);
                try
                {
                    return parser.Parse(reader);
                }
                finally
                {
                    reader.Close();
                }
            }

            // Serialized editable instance (binary serialization).
            IFormatter formatter = new BinaryFormatter();
            using (Stream stream = new FileStream(path, FileMode.Open,
                                                  FileAccess.Read, FileShare.Read))
            {
                IEditable<T> editable = (IEditable<T>)formatter.Deserialize(stream);
                if (editable.IsComplete)
                {
                    return editable.GetComplete();
                }

                throw new System.ApplicationException("Not complete T");
            }
        }
Пример #9
0
        /// <summary>
        /// Runs the text parser created for <paramref name="fileInfo"/>'s full path and
        /// returns the parsed text.
        /// </summary>
        /// <param name="fileInfo">The file to parse.</param>
        /// <returns>The string produced by the text parser.</returns>
        public string ParseFileToString(FileInfo fileInfo)
        {
            var context = new ParserContext(fileInfo.FullName);

            return ParserFactory.CreateText(context).Parse();
        }
Пример #10
0
        /// <summary>
        /// Parses <paramref name="text"/> and, when the parser returns a non-null value,
        /// forwards it to the overload that compiles parsed input.
        /// </summary>
        /// <param name="text">The source text to parse and compile.</param>
        /// <returns>
        /// <c>false</c> when the parser returns null; otherwise the result of compiling the
        /// parsed text.
        /// </returns>
        public bool TryCompile(string text)
        {
            var parsed = _parser.Parse(text);

            if (parsed == null)
            {
                return false;
            }

            return TryCompile(parsed);
        }
Пример #11
0
        /// <summary>
        /// Parsing the sample "toxy.zip" as text should yield 68 non-empty lines.
        /// </summary>
        public void TestParseDirectoryFromZip()
        {
            var context = new ParserContext(TestDataSample.GetFilePath("toxy.zip", null));
            string listing = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(listing);

            string[] entries = listing.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
            Assert.AreEqual(68, entries.Length);
        }
Пример #12
0
        /// <summary>
        /// The text extracted from "Employee.xls" should contain the expected column headers
        /// (at a position after the first character).
        /// </summary>
        public void TestExcel2003TextParser()
        {
            var context = new ParserContext(TestDataSample.GetExcelPath("Employee.xls"));
            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            int lastNameAt = extracted.IndexOf("Last name");
            Assert.IsTrue(lastNameAt > 0);

            int firstNameAt = extracted.IndexOf("First name");
            Assert.IsTrue(firstNameAt > 0);
        }
Пример #13
0
        /// <summary>
        /// Parsing "utf8.txt" should return its four lines, each terminated by the
        /// platform newline.
        /// </summary>
        public void TestReadWholeText()
        {
            var context = new ParserContext(TestDataSample.GetTextPath("utf8.txt"));
            string actual = ParserFactory.CreateText(context).Parse();

            string newLine = Environment.NewLine;
            string expected = string.Concat("hello world", newLine, "a2", newLine)
                              + string.Concat("a3", newLine, "bbb4", newLine);

            Assert.AreEqual(expected, actual);
        }
Пример #14
0
        /// <summary>
        /// Tries the first parser and falls back to the second one, over the same span,
        /// when the first has no result.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>The first parser's result if it has one; otherwise the second parser's result.</returns>
        public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            var primary = _first.Parse(text, span);

            if (!primary.HasResult)
            {
                return _second.Parse(text, span);
            }

            return primary;
        }
Пример #15
0
        /// <summary>
        /// With the "IncludeHeaderFooter" property set to "1", the extracted text of
        /// "WithVariousData.xlsx" should contain the header text.
        /// </summary>
        public void TestExcel2007TextParserWithHeaderFooter()
        {
            var context = new ParserContext(TestDataSample.GetExcelPath("WithVariousData.xlsx"));
            context.Properties.Add("IncludeHeaderFooter", "1");

            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            int headerAt = extracted.IndexOf("This is the header");
            Assert.IsTrue(headerAt > 0);
        }
Пример #16
0
        /// <summary>
        /// With the "IncludeSheetNames" property set to "0", the extracted text of
        /// "WithVariousData.xlsx" should not contain the sheet name.
        /// </summary>
        public void TestExcel2007TextParserWithoutSheetNames()
        {
            var context = new ParserContext(TestDataSample.GetExcelPath("WithVariousData.xlsx"));
            context.Properties.Add("IncludeSheetNames", "0");

            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            int sheetNameAt = extracted.IndexOf("Sheet1");
            Assert.IsTrue(sheetNameAt < 0);
        }
Пример #17
0
        /// <summary>
        /// Rejects the input when the exclusion parser matches at <paramref name="span"/>;
        /// otherwise delegates to the inner parser.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Unmatched</c> at <paramref name="span"/> when the exclusion parser has a result;
        /// otherwise the inner parser's result.
        /// </returns>
        public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            var excluded = _except.Parse(text, span);

            if (!excluded.HasResult)
            {
                return _parser.Parse(text, span);
            }

            return new Unmatched<TextSpan, TextSpan>(span);
        }
Пример #18
0
        /// <summary>
        /// Runs the inner parser and substitutes the configured default value when it has
        /// no result.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// The inner parser's result if it has one; otherwise <c>Success</c> with the default
        /// value, positioned at the unchanged <paramref name="span"/>.
        /// </returns>
        public Result<TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            var attempt = _parser.Parse(text, span);

            if (!attempt.HasResult)
            {
                return new Success<TextSpan, TextSpan>(_defaultValue, span);
            }

            return attempt;
        }
Пример #19
0
        /// <summary>
        /// Parses <paramref name="text"/> over <paramref name="span"/> synchronously and
        /// returns the resulting cursor wrapped in an already-completed task.
        /// </summary>
        /// <param name="text">The text to parse.</param>
        /// <param name="span">The span of <paramref name="text"/> to parse.</param>
        /// <param name="parser">The parser to apply.</param>
        /// <returns>
        /// A completed task holding a <c>StreamTextCursor</c> when the parser has a result,
        /// or an <c>EmptyTextCursor</c> otherwise.
        /// </returns>
        public static Task<TextCursor> ParseText(ParseText text, TextSpan span, ITextParser parser)
        {
            var outcome = parser.Parse(text, span);

            // The text is wrapped as stream text with a null second argument in both cases.
            var wrapped = new StreamText(text, null);

            TextCursor cursor;
            if (outcome.HasResult)
            {
                cursor = new StreamTextCursor(wrapped, outcome.Result, outcome.Next, parser);
            }
            else
            {
                cursor = new EmptyTextCursor(wrapped, span);
            }

            return Task.FromResult(cursor);
        }
Пример #20
0
        /// <summary>
        /// Loads an instance of T by opening the file at <paramref name="path"/> as text and
        /// parsing it with <paramref name="parser"/>.  The reader is closed whether or not
        /// parsing succeeds.
        /// </summary>
        /// <param name="path">Path of the text file to load.</param>
        /// <param name="parser">Parser that produces the instance from the file's lines.</param>
        /// <returns>The parsed instance.</returns>
        private T Load<T>(string path,
                          ITextParser<T> parser)
        {
            LineReader lines = this.OpenTextFile(path);

            try
            {
                T parsed = parser.Parse(lines);
                return parsed;
            }
            finally
            {
                lines.Close();
            }
        }
Пример #21
0
        /// <summary>
        /// Runs the inner parser and converts its matched span with the configured converter.
        /// </summary>
        /// <param name="text">The text being parsed.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Success</c> with the converted value when the inner parser matches and the
        /// conversion succeeds; otherwise <c>Unmatched</c> at the inner parser's next position.
        /// </returns>
        public Result<TextSpan, T> Parse(ParseText text, TextSpan span)
        {
            var attempt = _parser.Parse(text, span);

            if (attempt.HasResult && _converter(text, attempt.Result, out T converted))
            {
                return new Success<TextSpan, T>(converted, attempt.Next);
            }

            return new Unmatched<TextSpan, T>(attempt.Next);
        }
Пример #22
0
 /// <summary>
 /// Parses <paramref name="text"/> into sentences and returns them rendered by the
 /// formatter selected through <paramref name="format"/>.
 /// </summary>
 /// <param name="format">Name of the output format, passed to <c>MakeTextFormatter</c>.</param>
 /// <param name="text">The text to parse.</param>
 /// <returns>
 /// An OK result carrying the formatted sentences, or a No Content result when parsing or
 /// formatting throws.
 /// </returns>
 public IActionResult Get(string format, string text)
 {
     try
     {
         var sentences = _parser.Parse(text);
         var formatter = MakeTextFormatter(format);
         return Ok(formatter.Format(sentences));
     }
     catch (Exception e)
     {
         // Pass the exception as the first argument: when it is passed after the message
         // it is treated as a format argument and, with no placeholder in the template,
         // dropped from the log entry.
         _logger.LogError(e, "Text processing has failed.");
         return NoContent();
     }
 }
Пример #23
0
        /// <summary>
        /// With default settings, the extracted text of "WithVariousData.xlsx" should contain
        /// the cell values but not the header text.
        /// </summary>
        public void TestExcel2007TextParser()
        {
            var context = new ParserContext(TestDataSample.GetExcelPath("WithVariousData.xlsx"));
            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            // Cell contents are expected somewhere after the first character.
            Assert.IsTrue(extracted.IndexOf("Foo") > 0);
            Assert.IsTrue(extracted.IndexOf("Bar") > 0);
            Assert.IsTrue(extracted.IndexOf("a really long cell") > 0);
            Assert.IsTrue(extracted.IndexOf("have a header") > 0);
            Assert.IsTrue(extracted.IndexOf("have a footer") > 0);

            // The header itself must be absent when it was not requested.
            Assert.IsTrue(extracted.IndexOf("This is the header") < 0);
        }
        /// <summary>
        /// The text extracted from "testPPT.pptx" should split into 14 non-empty lines with
        /// the expected content at selected positions.
        /// </summary>
        public void ReadTextBasicTest()
        {
            string fullPath = Path.GetFullPath(TestDataSample.GetPowerpointPath("testPPT.pptx"));
            var context = new ParserContext(fullPath);
            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNullOrEmpty(extracted);

            string[] lines = extracted.Split(new[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
            Assert.AreEqual(14, lines.Length);

            Assert.AreEqual("Attachment Test", lines[0]);
            Assert.AreEqual("Rajiv", lines[1]);
            Assert.AreEqual("Different words to test against", lines[4]);
            Assert.AreEqual("Hello", lines[7]);
        }
Пример #25
0
        /// <summary>
        /// A zero-or-more character parser that excludes 'b', built with a LINQ query
        /// expression, should match only the leading "a" of "abcd".
        /// </summary>
        public void Should_match_up_to_the_except_match_for_one_using_LINQ()
        {
            string input = "abcd";

            ITextParser parser = Parser.Factory.CreateText(x =>
                from prefix in x.Char.Except(x.Char('b')).ZeroOrMore()
                select prefix);

            Result<TextSpan, TextSpan> outcome = parser.Parse(input);

            Assert.IsTrue(outcome.HasResult);
            Assert.That(outcome.Result.Length, Is.EqualTo(1));

            string matchedText = input.Substring(outcome.Result.Start, outcome.Result.Length);
            Assert.That(matchedText, Is.EqualTo("a"));
        }
Пример #26
0
        /// <summary>
        /// Extracts the text of the file at <paramref name="filePath"/> and rejects it when
        /// it contains a control character other than line feed, tab or carriage return.
        /// </summary>
        /// <param name="filePath">Path of the file to parse.</param>
        /// <param name="extension">Not used by this method.</param>
        /// <returns>
        /// The extracted text, or <c>null</c> when a disallowed control character is found
        /// (the first one is reported on standard error).
        /// </returns>
        public string ExtractText(string filePath, string extension)
        {
            var context = new ParserContext(filePath);
            string extracted = ParserFactory.CreateText(context).Parse();

            for (int i = 0; i < extracted.Length; i++)
            {
                char current = extracted[i];
                bool allowedWhitespace = current == '\n' || current == '\t' || current == '\r';

                if (!allowedWhitespace && char.IsControl(current))
                {
                    Console.Error.WriteLine("Found control character: {0} {1}", (int)current, current);
                    return null;
                }
            }

            return extracted;
        }
Пример #27
0
        /// <summary>
        /// The text extracted from "SampleDoc.docx" should split into the six expected
        /// non-empty lines.
        /// </summary>
        public void TestParseTextFromWord()
        {
            var context = new ParserContext(TestDataSample.GetWordPath("SampleDoc.docx"));
            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            string[] expected =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue",
            };

            string[] actual = extracted.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
            Assert.AreEqual(6, actual.Length);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i]);
            }
        }
Пример #28
0
        /// <summary>
        /// Reads the command-line options, builds the word cloud from the input file, renders
        /// it, saves the image to the output file, and waits for a key press.
        /// </summary>
        public void Run()
        {
            var options = new Options();
            CommandLine.Parser.Default.ParseArguments(Args, options);

            // Read and tokenise the input before gathering the filter/converter names.
            var words = parser.Parse(reader.Read(options.InputFile));
            var filterNames = GetFiltersNames();
            var converterNames = GetConvertorsNames();

            cloudCreator.Create(
                words,
                options.MaxFontSize,
                options.MinFontSize,
                options.WordsCount,
                options.Font,
                filterNames,
                converterNames);

            var image = visualizer.Vizualize(cloudCreator.RectanglesCloud, Color.AliceBlue);
            saver.Save(image, options.OutputFile);

            Console.WriteLine($@"Image saved to {options.OutputFile}");
            Console.ReadKey();
        }
Пример #29
0
        /// <summary>
        /// Compares the characters in <c>_chars</c> with the text covered by successive
        /// results of the inner parser, accumulating the compared spans in a single span.
        /// </summary>
        /// <param name="text">The text being parsed; indexed per character.</param>
        /// <param name="span">The span at which parsing starts.</param>
        /// <returns>
        /// <c>Success</c> with the accumulated span when its length equals the pattern
        /// length; <c>Unmatched</c> from the first differing character to the end of the
        /// current result on a mismatch; otherwise <c>Unmatched</c> at the last position.
        /// </returns>
        public Result <TextSpan, TextSpan> Parse(ParseText text, TextSpan span)
        {
            var next = span;

            // Accumulator for the matched text, seeded from the head of the starting span.
            var matched = next.Head;
            int patternIndex;

            // NOTE(review): patternIndex is advanced by this loop AND by the inner loop below.
            // When the inner parser has no result, `next` is left unchanged and the same
            // parse is attempted again on the following iteration — confirm this is intended.
            for (patternIndex = 0; patternIndex < _chars.Length; patternIndex++)
            {
                var parsed = _parser.Parse(text, next);
                if (parsed.HasResult)
                {
                    // The parser matched without advancing; stop to avoid spinning in place.
                    if (next == parsed.Next)
                    {
                        break;
                    }

                    var result = parsed.Result;

                    // Walk the characters covered by this result, comparing each with the
                    // pattern character at patternIndex.
                    // NOTE(review): patternIndex is not checked against _chars.Length inside
                    // this loop — verify a result longer than the remaining pattern cannot occur.
                    for (int sourceIndex = result.Start; sourceIndex < parsed.Result.End; sourceIndex++, patternIndex++)
                    {
                        // Exits only this inner loop; the outer loop carries on.
                        if (!matched.IsAdjacentTo(result))
                        {
                            break;
                        }

                        if (_chars[patternIndex] != text[sourceIndex])
                        {
                            return(new Unmatched <TextSpan, TextSpan>(TextSpan.FromBounds(sourceIndex, result.End)));
                        }

                        // NOTE(review): the whole result span is added once per character
                        // compared, not once per result — confirm the semantics of `+=` here.
                        matched += result;
                    }

                    next = parsed.Next;
                }
            }

            // Succeeds only when the accumulated span is exactly as long as the pattern.
            if (matched.Length == _chars.Length)
            {
                return(new Success <TextSpan, TextSpan>(matched, next));
            }

            return(new Unmatched <TextSpan, TextSpan>(next));
        }
Пример #30
0
        /// <summary>
        /// The text extracted from "SampleDoc.doc", split on carriage returns, should yield
        /// the eight expected segments, including the form-feed segment and the trailing
        /// empty segment.
        /// </summary>
        public void TestParseTextFromWord()
        {
            var context = new ParserContext(TestDataSample.GetWordPath("SampleDoc.doc"));
            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);

            string[] expected =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "\f",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue",
                "",
            };

            string[] actual = extracted.Split('\r');
            Assert.AreEqual(8, actual.Length);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i]);
            }
        }