/// <summary>
/// Writes the downloaded response to a temporary file whose extension is derived from the
/// content type, runs the matching text parser over it and stores the result in
/// <c>propertyBag.Text</c>. <c>propertyBag.Title</c> is set to the step's path and query.
/// </summary>
/// <param name="crawler">Not used by this step.</param>
/// <param name="propertyBag">Crawl result to read the response from and to enrich.</param>
/// <returns>Always <c>true</c>, whether or not text was extracted.</returns>
public async Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Nothing to extract unless the request succeeded and a body was captured.
            bool hasPayload = propertyBag.StatusCode == HttpStatusCode.OK && propertyBag.Response != null;
            if (!hasPayload)
            {
                return true;
            }

            // Content types without a mapped extension are skipped.
            string fileExtension = MapContentTypeToExtension(propertyBag.ContentType);
            if (fileExtension.IsNullOrEmpty())
            {
                return true;
            }

            propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

            using (TempFile scratch = new TempFile())
            {
                // The parser factory is handed a path, so the temp file gets the mapped extension.
                scratch.FileName = scratch.FileName + "." + fileExtension;

                using (FileStream output = new FileStream(scratch.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
                {
                    await output.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
                }

                ITextParser textParser = ParserFactory.CreateText(new ParserContext(scratch.FileName));
                propertyBag.Text = textParser.Parse();
            }

            return true;
        }
示例#2
0
        /// <summary>
        /// Checks that the <c>ParseLine</c> event of <see cref="PlainTextParser"/> reports the
        /// expected text for line numbers 0-3 of utf8.txt and that parsing returns non-empty text.
        /// </summary>
        public void TestParseLineEvent()
        {
            ParserContext context = new ParserContext(TestDataSample.GetTextPath("utf8.txt"));
            var lineParser = (PlainTextParser)ParserFactory.CreateText(context);

            lineParser.ParseLine += (sender, args) =>
            {
                // Only the first four lines are checked; any other line number is ignored.
                switch (args.LineNumber)
                {
                case 0:
                    Assert.AreEqual("hello world", args.Text);
                    break;

                case 1:
                    Assert.AreEqual("a2", args.Text);
                    break;

                case 2:
                    Assert.AreEqual("a3", args.Text);
                    break;

                case 3:
                    Assert.AreEqual("bbb4", args.Text);
                    break;
                }
            };

            string parsed = lineParser.Parse();

            Assert.IsNotNullOrEmpty(parsed);
        }
示例#3
0
        // Parses the file at `path` and returns its words, lower-cased, with punctuation removed.
        //   option == "txt"      -> plain-text parser (.txt)
        //   option == "document" -> document parser (.docx, .pdf)
        // Parsing is best-effort: if the parser throws, or `option` is neither value, the error
        // (if any) is written to the console and an empty array is returned instead of
        // dereferencing a null text.
        public string[] GetText(string path, string option)
        {
            string text = null;

            try
            {
                ParserContext context = new ParserContext(path);
                if (option.Equals("txt"))
                {
                    ITextParser parser = ParserFactory.CreateText(context);
                    text = parser.Parse().ToString();
                }
                else if (option.Equals("document"))
                {
                    IDocumentParser parser = ParserFactory.CreateDocument(context);
                    text = parser.Parse().ToString();
                }

                if (text != null)
                {
                    // Lower-case and flatten line breaks / tabs to spaces so the split below sees one line.
                    text = text.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception found");
                Console.WriteLine(e.Message);
                // Discard any partially processed text so a failure always yields "no words".
                text = null;
            }

            if (text == null)
            {
                // Nothing was parsed (failure or unknown option): there are no words to return.
                return new string[0];
            }

            text = RemovePunctuation(text);
            // A null separator array splits on whitespace.
            string[] words = text.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
            return(words);
        }
示例#4
0
        /// <summary>
        /// Runs the text parser that <see cref="ParserFactory"/> creates for the given file
        /// and returns the parsed text.
        /// </summary>
        /// <param name="fileInfo">File to parse; its full path is handed to the parser context.</param>
        /// <returns>The string returned by the parser.</returns>
        public string ParseFileToString(FileInfo fileInfo)
        {
            var textParser = ParserFactory.CreateText(new ParserContext(fileInfo.FullName));
            return textParser.Parse();
        }
示例#5
0
        /// <summary>
        /// Parsing the sample message "raw text mail demo.msg" must yield non-empty text.
        /// </summary>
        public void PureTextMsg_ReadTextTest()
        {
            var context = new ParserContext(TestDataSample.GetEmailPath("raw text mail demo.msg"));
            var mailParser = ParserFactory.CreateText(context);

            string body = mailParser.Parse();

            Assert.IsNotNullOrEmpty(body);
        }
示例#6
0
        /// <summary>
        /// Text extracted from Employee.xls must contain the "Last name" and "First name"
        /// headers at an index greater than zero.
        /// </summary>
        public void TestExcel2003TextParser()
        {
            string workbookPath = TestDataSample.GetExcelPath("Employee.xls");
            ITextParser parser = ParserFactory.CreateText(new ParserContext(workbookPath));

            string extracted = parser.Parse();

            Assert.IsNotNull(extracted);
            foreach (string header in new[] { "Last name", "First name" })
            {
                Assert.IsTrue(extracted.IndexOf(header) > 0);
            }
        }
示例#7
0
        /// <summary>
        /// Parsing utf8.txt must return its four lines, each terminated by
        /// <see cref="Environment.NewLine"/>.
        /// </summary>
        public void TestReadWholeText()
        {
            ParserContext context = new ParserContext(TestDataSample.GetTextPath("utf8.txt"));
            string actual = ParserFactory.CreateText(context).Parse();

            // Every line, including the last one, ends with the platform newline.
            string newLine = Environment.NewLine;
            string expected = string.Join(newLine, new[] { "hello world", "a2", "a3", "bbb4" }) + newLine;

            Assert.AreEqual(expected, actual);
        }
示例#8
0
        /// <summary>
        /// Parsing toxy.zip as text must yield 68 non-empty lines.
        /// </summary>
        public void TestParseDirectoryFromZip()
        {
            string archivePath = TestDataSample.GetFilePath("toxy.zip", null);
            ITextParser zipParser = ParserFactory.CreateText(new ParserContext(archivePath));

            string listing = zipParser.Parse();
            Assert.IsNotNull(listing);

            string[] separators = { Environment.NewLine };
            string[] entries = listing.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            Assert.AreEqual(68, entries.Length);
        }
示例#9
0
        /// <summary>
        /// With the "IncludeSheetNames" property set to "0", the text extracted from
        /// WithVariousData.xlsx must not contain "Sheet1".
        /// </summary>
        public void TestExcel2007TextParserWithoutSheetNames()
        {
            string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
            var context = new ParserContext(workbookPath);
            context.Properties.Add("IncludeSheetNames", "0");

            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);
            Assert.IsTrue(extracted.IndexOf("Sheet1") < 0);
        }
示例#10
0
        /// <summary>
        /// With the "IncludeHeaderFooter" property set to "1", the text extracted from
        /// WithVariousData.xlsx must contain "This is the header" at an index greater than zero.
        /// </summary>
        public void TestExcel2007TextParserWithHeaderFooter()
        {
            string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
            var context = new ParserContext(workbookPath);
            context.Properties.Add("IncludeHeaderFooter", "1");

            string extracted = ParserFactory.CreateText(context).Parse();

            Assert.IsNotNull(extracted);
            Assert.IsTrue(extracted.IndexOf("This is the header") > 0);
        }
        /// <summary>
        /// Text extracted from testPPT.pptx must split into 14 non-empty CRLF-separated
        /// entries, with known values at positions 0, 1, 4 and 7.
        /// </summary>
        public void ReadTextBasicTest()
        {
            string deckPath = Path.GetFullPath(TestDataSample.GetPowerpointPath("testPPT.pptx"));
            ITextParser slideParser = ParserFactory.CreateText(new ParserContext(deckPath));

            string extracted = slideParser.Parse();
            Assert.IsNotNullOrEmpty(extracted);

            string[] crlf = { "\r\n" };
            string[] entries = extracted.Split(crlf, StringSplitOptions.RemoveEmptyEntries);

            Assert.AreEqual(14, entries.Length);
            Assert.AreEqual("Attachment Test", entries[0]);
            Assert.AreEqual("Rajiv", entries[1]);
            Assert.AreEqual("Different words to test against", entries[4]);
            Assert.AreEqual("Hello", entries[7]);
        }
示例#12
0
        /// <summary>
        /// With default properties, text extracted from WithVariousData.xlsx must contain
        /// the listed cell fragments at an index greater than zero and must not contain
        /// "This is the header".
        /// </summary>
        public void TestExcel2007TextParser()
        {
            string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
            string extracted = ParserFactory.CreateText(new ParserContext(workbookPath)).Parse();

            Assert.IsNotNull(extracted);

            string[] expectedFragments =
            {
                "Foo", "Bar", "a really long cell", "have a header", "have a footer"
            };
            foreach (string fragment in expectedFragments)
            {
                Assert.IsTrue(extracted.IndexOf(fragment) > 0);
            }

            // Header text is only checked for absence here; no properties were set on the context.
            Assert.IsTrue(extracted.IndexOf("This is the header") < 0);
        }
示例#13
0
        /// <summary>
        /// Text extracted from the sample HTML message must contain the [From], [To] and
        /// [Subject] lines and must not contain [Cc] or [Bcc] past index zero.
        /// </summary>
        public void HtmlMsg_ReadTextTest()
        {
            var context = new ParserContext(TestDataSample.GetEmailPath("Azure pricing and services updates.msg"));
            var mailParser = ParserFactory.CreateText(context);

            string body = mailParser.Parse();

            Assert.IsNotNullOrEmpty(body);

            // Headers that must be present.
            Assert.IsTrue(body.IndexOf("[From] Azure Team<*****@*****.**>") >= 0);
            Assert.IsTrue(body.IndexOf("[To] [email protected]") > 0);
            Assert.IsTrue(body.IndexOf("[Subject] Azure pricing and services updates") > 0);

            // Headers that must not appear.
            foreach (string absentHeader in new[] { "[Cc]", "[Bcc]" })
            {
                Assert.IsFalse(body.IndexOf(absentHeader) > 0);
            }
        }
示例#14
0
        /// <summary>
        /// Extracts text from <paramref name="filePath"/> and rejects it if it contains any
        /// control character other than line feed, tab or carriage return.
        /// </summary>
        /// <param name="filePath">Path of the file to parse.</param>
        /// <param name="extension">Not used by this method.</param>
        /// <returns>
        /// The parsed text, or <c>null</c> (after reporting the first offending character on
        /// standard error) when a disallowed control character is found.
        /// </returns>
        public string ExtractText(string filePath, string extension)
        {
            ITextParser textParser = ParserFactory.CreateText(new ParserContext(filePath));
            string extracted = textParser.Parse();

            for (int i = 0; i < extracted.Length; i++)
            {
                char current = extracted[i];

                // Ordinary characters and the three whitespace controls are acceptable.
                bool allowed = !char.IsControl(current) || current == '\n' || current == '\t' || current == '\r';
                if (allowed)
                {
                    continue;
                }

                Console.Error.WriteLine("Found control character: {0} {1}", (int)current, current);
                return null;
            }

            return extracted;
        }
示例#15
0
        /// <summary>
        /// Text extracted from SampleDoc.docx must split into exactly six non-empty lines
        /// with the expected content.
        /// </summary>
        public void TestParseTextFromWord()
        {
            string documentPath = TestDataSample.GetWordPath("SampleDoc.docx");
            string extracted = ParserFactory.CreateText(new ParserContext(documentPath)).Parse();

            Assert.IsNotNull(extracted);

            string[] expectedLines =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue"
            };

            string[] actualLines = extracted.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
            Assert.AreEqual(6, actualLines.Length);
            for (int i = 0; i < expectedLines.Length; i++)
            {
                Assert.AreEqual(expectedLines[i], actualLines[i]);
            }
        }
示例#16
0
        /// <summary>
        /// Text extracted from SampleDoc.doc must split on '\r' into exactly eight entries,
        /// including a form-feed entry and a trailing empty entry.
        /// </summary>
        public void TestParseTextFromWord()
        {
            string documentPath = TestDataSample.GetWordPath("SampleDoc.doc");
            string extracted = ParserFactory.CreateText(new ParserContext(documentPath)).Parse();

            Assert.IsNotNull(extracted);

            string[] expectedLines =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "\f",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue",
                ""
            };

            string[] actualLines = extracted.Split('\r');
            Assert.AreEqual(8, actualLines.Length);
            for (int i = 0; i < expectedLines.Length; i++)
            {
                Assert.AreEqual(expectedLines[i], actualLines[i]);
            }
        }
示例#17
0
        /// <summary>
        /// Extracts the text of a fixed PDF, reflows it to one sentence per line, then
        /// calls <c>MakeJSON</c> and <c>MakeRequest</c> with the JSON payload string.
        /// </summary>
        static void Main()
        {
            ITextParser pdfParser = ParserFactory.CreateText(new ParserContext(@"D:\SelfStudy\PDF1\FAQ_Short.pdf"));
            string extractedText = pdfParser.Parse();

            // Remove existing line breaks (http://www.atmarkit.co.jp/ait/articles/1004/08/news094.html),
            // then start a new line after every '.' and '?'.
            extractedText = extractedText
                            .Replace("\r", "")
                            .Replace("\n", "")
                            .Replace(".", ".\r\n")
                            .Replace("?", "?\r\n");

            // Remove blank lines (lines consisting only of the '.' character)
            // http://baba-s.hatenablog.com/entry/2018/05/18/171500
            // NOTE(review): the '.' in the pattern is unescaped, so it matches any single
            // character at the start of a line, not only a literal period - confirm intent.
            extractedText = Regex.Replace(extractedText, "^.[\r\n]+", string.Empty, RegexOptions.Multiline);

            // Text export: disabled while testing.
            // NOTE(review): with the export disabled, extractedText is not used by anything below.

            /***
             * StreamWriter sw_out = new StreamWriter(@"D:\SelfStudy\PDF1\UFR.txt", true, Encoding.GetEncoding("shift_jis"));
             * sw_out.Write(extractedText);
             * sw_out.Close();
             ***/

            // Build the JSON payload. It is passed by reference so MakeJSON can fill it in
            // (https://dobon.net/vb/dotnet/beginner/byvalbyref.html).
            string postJson = "";
            MakeJSON(ref postJson);
            MakeRequest(postJson);
        }
        /// <summary>
        /// Extracts text from <paramref name="content"/> by copying it to a temporary file and
        /// parsing that file with the text parser selected from the extension of
        /// <paramref name="fileName"/>.
        /// </summary>
        /// <param name="fileName">
        /// Original file name. Only its extension is used, to name the temporary file; the
        /// rest is discarded so a caller-supplied name can neither escape the temp directory
        /// nor collide with a concurrent extraction of a same-named file.
        /// </param>
        /// <param name="content">Source of the bytes to extract text from.</param>
        /// <param name="cancellationToken">Cancels opening and copying the content stream.</param>
        /// <returns>
        /// The parsed text, or the result of <c>ExtractAsSimpleText</c> when the parser
        /// factory or parser throws <see cref="InvalidDataException"/> or
        /// <see cref="NotSupportedException"/>.
        /// </returns>
        public async Task <string> Extract(string fileName, IContent content, CancellationToken cancellationToken)
        {
            await using var stream = await content.OpenReadStream(cancellationToken);

            try
            {
                var tempName = Guid.NewGuid().ToString("N") + Path.GetExtension(fileName);
                var path     = Path.Combine(Path.GetTempPath(), tempName);

                // Create the parser before writing anything: an unsupported extension then
                // fails here, before the content stream has been read or a file created.
                var toxyParser = ParserFactory.CreateText(new ParserContext(path));

                try
                {
                    await using (var fileStream = File.Create(path, 81920, FileOptions.Asynchronous))
                    {
                        await stream.CopyToAsync(fileStream, cancellationToken);
                    }

                    return toxyParser.Parse();
                }
                finally
                {
                    // Always remove the temp file, including when copying or parsing throws.
                    // File.Delete does not throw if the file was never created.
                    File.Delete(path);
                }
            }
            catch (Exception e) when(e is InvalidDataException || e is NotSupportedException)
            {
                // The stream may already have been consumed by the copy above; rewind when
                // possible so the plain-text fallback sees the content from the beginning.
                if (stream.CanSeek)
                {
                    stream.Position = 0;
                }

                return(await ExtractAsSimpleText(stream));
            }
        }
示例#19
0
文件: Program.cs 项目: datadiode/toxy
        /// <summary>
        /// Command-line entry point: prints the metadata and/or extracted text of one file.
        /// </summary>
        /// <param name="args">
        /// Exactly one file path, plus the optional switches <c>/encoding &lt;name&gt;</c>
        /// (default "UTF-8"), <c>/text</c> and <c>/metadata</c>, in any position.
        /// </param>
        /// <returns>
        /// The <c>Flags</c> value (cast to int) of the stages whose failure was reported;
        /// the number of remaining arguments when that number is not exactly one (usage is
        /// printed); or -1 when an exception escapes the stage-level handlers.
        /// </returns>
        static int Main(string[] args)
        {
            try
            {
                Console.OutputEncoding = Encoding.UTF8;

                var encoding = "UTF-8";
                var flags    = Flags.None;   // stages explicitly requested on the command line
                var caught   = Flags.None;   // stages that failed; becomes the exit code

                // Switches are removed from the list as they are recognised, so that only the
                // file path should remain afterwards.
                var arguments = new ArrayList(args);
                int i;
                if ((i = arguments.IndexOf("/encoding")) != -1)
                {
                    arguments.RemoveAt(i);
                    // After removing the switch, index i holds the argument that followed it (if any).
                    if (i < arguments.Count)
                    {
                        encoding = (string)arguments[i];
                        arguments.RemoveAt(i);
                    }
                }
                if ((i = arguments.IndexOf("/text")) != -1)
                {
                    arguments.RemoveAt(i);
                    flags |= Flags.Text;
                }
                if ((i = arguments.IndexOf("/metadata")) != -1)
                {
                    arguments.RemoveAt(i);
                    flags |= Flags.Metadata;
                }

                // NOTE(review): when no file path is given this returns 0 after printing usage.
                if (arguments.Count != 1)
                {
                    Console.WriteLine(Usage);
                    return(arguments.Count);
                }

                var filepath = (string)arguments[0];

                ParserContext context = new ParserContext(filepath);
                context.Encoding = Encoding.GetEncoding(encoding);

                // A text parser is attempted when /text was given or when no switch was given at all.
                ITextParser tparser = null;
                if (flags == Flags.None || (flags & Flags.Text) != 0)
                {
                    try
                    {
                        tparser = ParserFactory.CreateText(context);
                    }
                    catch (Exception e)
                    {
                        if (flags == Flags.None)
                        {
                            // No explicit request: fall back to metadata without reporting the error.
                            flags = Flags.Metadata;
                        }
                        else
                        {
                            caught |= Flags.Text;
                            Console.WriteLine(e);
                        }
                    }
                }

                if ((flags & Flags.Metadata) != 0)
                {
                    try
                    {
                        var parser = ParserFactory.CreateMetadata(context);
                        Console.WriteLine(string.Format("[{0}]", parser.GetType().Name));
                        var metadatas = parser.Parse();
                        foreach (var data in metadatas)
                        {
                            Console.WriteLine(string.Format("{0} = {1}", data.Name.PadRight(23), data.Value.ToString()));
                        }
                        Console.WriteLine();
                    }
                    catch (Exception e)
                    {
                        caught |= Flags.Metadata;
                        Console.WriteLine(e);
                    }
                }

                if (tparser != null)
                {
                    try
                    {
                        // The parser-type header is printed only when /text was requested explicitly.
                        if ((flags & Flags.Text) != 0)
                        {
                            Console.WriteLine(string.Format("[{0}]", tparser.GetType().Name));
                        }
                        var text = tparser.Parse();
                        // When the text ends with '\r', every '\r' is replaced with '\n'.
                        if (text.EndsWith("\r")) // as seems to happen with .doc files
                        {
                            text = text.Replace('\r', '\n');
                        }
                        Console.Write(text);
                    }
                    catch (Exception e)
                    {
                        caught |= Flags.Text;
                        Console.WriteLine(e);
                    }
                }

                return((int)caught);
            }
            catch (Exception e)
            {
                // Anything not handled per stage (e.g. from argument or encoding handling above).
                Console.Write(e);
                return(-1);
            }
        }
示例#20
0
        /// <summary>
        /// Parses <paramref name="filepath"/> and shows the result in the panel that matches
        /// the current <c>Mode</c>: plain text, a structured view chosen by
        /// <paramref name="extension"/>, or (for any other mode) the file's metadata.
        /// </summary>
        /// <param name="filepath">Path of the file to display.</param>
        /// <param name="encoding">Encoding name passed to <see cref="Encoding.GetEncoding(string)"/>.</param>
        /// <param name="extension">
        /// File extension including the leading dot; selects the structured view. Extensions
        /// without a case below show nothing in structured mode.
        /// </param>
        private void ShowDocument(string filepath, string encoding, string extension)
        {
            ParserContext context = new ParserContext(filepath);

            context.Encoding = Encoding.GetEncoding(encoding);

            if (Mode == ViewMode.Text)
            {
                AppendRichTextBox();
                var tparser = ParserFactory.CreateText(context);
                rtbPanel.Text     = tparser.Parse();
                tbParserType.Text = tparser.GetType().Name;
            }
            else if (Mode == ViewMode.Structured)
            {
                switch (extension)
                {
                case ".csv":
                    AppendSpreadsheetGrid();
                    // The "HasHeader" property is set to "1" for CSV input only.
                    context.Properties.Add("HasHeader", "1");
                    ShowSpreadsheetTables(context);
                    break;

                case ".xlsx":
                case ".xls":
                    AppendSpreadsheetGrid();
                    ShowSpreadsheetTables(context);
                    break;

                case ".vcf":
                    AppendDataGridView();
                    var vparser = ParserFactory.CreateVCard(context);
                    ToxyBusinessCards vcards = vparser.Parse();
                    tbParserType.Text             = vparser.GetType().Name;
                    gridPanel.GridView.DataSource = vcards.ToDataTable().DefaultView;
                    break;

                case ".pptx":
                    //TODO: show slides
                    break;

                case ".xml":
                case ".htm":
                case ".html":
                    AppendTreePanel();
                    var      domparser = ParserFactory.CreateDom(context);
                    ToxyDom  htmlDom   = domparser.Parse();
                    TreeNode rootNode  = treePanel.Tree.Nodes.Add(htmlDom.Root.NodeString);
                    // Suspend repainting while the tree is populated.
                    treePanel.Tree.BeginUpdate();
                    AppendTree(rootNode, htmlDom.Root);
                    treePanel.Tree.EndUpdate();
                    //rootNode.ExpandAll();
                    break;
                }
            }
            else
            {
                AppendPropertyListPanel();
                var          mparser   = ParserFactory.CreateMetadata(context);
                ToxyMetadata metadatas = mparser.Parse();
                plPanel.Clear();
                foreach (var data in metadatas)
                {
                    plPanel.AddItem(data.Name, data.Value.ToString());
                }
                tbParserType.Text = mparser.GetType().Name;
            }
        }

        /// <summary>
        /// Parses a spreadsheet into <c>ss</c>, shows its first table in the grid and fills
        /// the sheet selector with every table name, selecting the first one.
        /// Shared by the .csv and .xls/.xlsx branches of <c>ShowDocument</c>.
        /// </summary>
        private void ShowSpreadsheetTables(ParserContext context)
        {
            ISpreadsheetParser parser = ParserFactory.CreateSpreadsheet(context);
            ss = parser.Parse();
            tbParserType.Text = parser.GetType().Name;

            ShowToGrid(ss.Tables[0]);

            cbSheets.Items.Clear();
            foreach (var table in ss.Tables)
            {
                cbSheets.Items.Add(table.Name);
            }
            cbSheets.SelectedIndex = 0;
            panel1.Visible         = true;
        }
示例#21
0
        /// <summary>
        /// Extracts the text of <paramref name="file"/>, tokenises it and adds each stemmed
        /// token to the inverted index under the current <c>docId</c>. The file is then
        /// registered in <c>FileMatch</c> and <c>docId</c> is advanced - this also happens
        /// when one of the handled exceptions was reported.
        /// </summary>
        /// <param name="file">
        /// File to index; ignored if it does not exist or its extension is not in <c>extensions</c>.
        /// </param>
        private void ProcessFile(FileInfo file)
        {
            if (!file.Exists || !extensions.Contains(file.Extension))
            {
                return;
            }

            // Token delimiters. \u2022 is the unicode for a bullet symbol.
            char[] delimiters =
            {
                ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.',
                '\r', '\n', '|'
            };

            try
            {
                // Toxy extracts the text; markup files additionally have their tags removed,
                // and .pptx goes through a dedicated extractor.
                string content;
                switch (file.Extension)
                {
                case ".html":
                case ".htm":
                case ".xml":
                    string markup = ParserFactory.CreateText(new ParserContext(file.FullName)).Parse();
                    content = RemoveAllTags(markup);
                    break;

                case ".pptx":
                    content = ExtractPptxText(file);
                    break;

                default:
                    content = ParserFactory.CreateText(new ParserContext(file.FullName)).Parse();
                    break;
                }

                string[] tokens = content.ToLower().Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
                foreach (string token in tokens)
                {
                    // Index only tokens that are not stop words and contain no digits.
                    bool indexable = !(stopwords.Contains(token) || Regex.IsMatch(token, "\\d+"));
                    if (indexable)
                    {
                        // Tokens are stemmed before being added; the position counter advances per indexed token.
                        var index = InvertedIndex.GetInstance();
                        index.Add(stemmer.StemWord(token.Trim()), new InvertedIndex.Tuple(docId, wordPosition++));
                    }
                }
            }
            catch (Exception e) when(e is IOException || e is NullReferenceException || e is ZipException)
            {
                MessageBox.Show(@"Please close all programs using the files you want to search.");
            }
            catch (InvalidDataException)
            {
                MessageBox.Show(@"Invalid file format.");
            }

            FileMatch.GetInstance().Add(docId, file);
            docId++;
        }