/// <summary>
/// Extracts plain text from a successfully downloaded response and stores it,
/// together with a title, on <paramref name="propertyBag"/>.
/// </summary>
/// <param name="crawler">The crawler that produced the response (not used here).</param>
/// <param name="propertyBag">Carries the response and receives <c>Title</c> and <c>Text</c>.</param>
/// <returns>Always <c>true</c>.</returns>
public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    // Nothing to do for failed requests.
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return true;
    }

    // Nothing to do for responses without a body.
    if (propertyBag.Response == null)
    {
        return true;
    }

    // Content types that map to no extension are skipped.
    string fileExtension = MapContentTypeToExtension(propertyBag.ContentType);
    if (fileExtension.IsNullOrEmpty())
    {
        return true;
    }

    propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;

    using (TempFile scratch = new TempFile())
    {
        // The parser is created from the file name, so the temp file gets the mapped extension.
        scratch.FileName = scratch.FileName + "." + fileExtension;

        using (FileStream output = new FileStream(scratch.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
        {
            await output.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
        }

        // The write stream is closed before the parser opens the file.
        ITextParser textParser = ParserFactory.CreateText(new ParserContext(scratch.FileName));
        propertyBag.Text = textParser.Parse();
    }

    return true;
}
/// <summary>
/// Checks that <c>ParseLine</c> is raised with the expected text for lines 0-3
/// of the "utf8.txt" sample and that the overall parse result is non-empty.
/// </summary>
public void TestParseLineEvent()
{
    ParserContext context = new ParserContext(TestDataSample.GetTextPath("utf8.txt"));
    PlainTextParser lineParser = (PlainTextParser)ParserFactory.CreateText(context);

    lineParser.ParseLine += (sender, args) =>
    {
        // Line numbers outside 0-3 are deliberately not checked.
        switch (args.LineNumber)
        {
            case 0:
                Assert.AreEqual("hello world", args.Text);
                break;
            case 1:
                Assert.AreEqual("a2", args.Text);
                break;
            case 2:
                Assert.AreEqual("a3", args.Text);
                break;
            case 3:
                Assert.AreEqual("bbb4", args.Text);
                break;
        }
    };

    string parsed = lineParser.Parse();
    Assert.IsNotNullOrEmpty(parsed);
}
// Parses the file at `path` and returns its words as an array of strings.
//   option == "document" -> parsed with the document parser (e.g. .docx, .pdf)
//   option == "txt"      -> parsed with the text parser (e.g. .txt)
// The text is lower-cased, line breaks and tabs become spaces, punctuation is
// removed via RemovePunctuation, and the result is split on whitespace.
// Returns an empty array when nothing could be parsed (parser failure or an
// unrecognised option) instead of dereferencing a null string.
public string[] GetText(string path, string option)
{
    string raw = null;
    try
    {
        ParserContext context = new ParserContext(path);
        if (option.Equals("txt"))
        {
            ITextParser parser = ParserFactory.CreateText(context);
            raw = parser.Parse();
        }
        else if (option.Equals("document"))
        {
            IDocumentParser parser = ParserFactory.CreateDocument(context);
            raw = parser.Parse().ToString();
        }

        if (raw != null)
        {
            // Same normalization for both parser kinds.
            raw = raw.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
        }
    }
    catch (Exception e)
    {
        // Best effort: report the failure and fall through with no text.
        Console.WriteLine("Exception found");
        Console.WriteLine(e.Message);
        raw = null;
    }

    // No text was produced: return "no words" rather than failing on null below.
    if (raw == null)
    {
        return new string[0];
    }

    string cleaned = RemovePunctuation(raw);

    // A null separator array makes Split break on any whitespace.
    return cleaned.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
}
/// <summary>
/// Runs the text parser selected by <c>ParserFactory.CreateText</c> on the given file.
/// </summary>
/// <param name="fileInfo">The file to parse; its full path is handed to the parser context.</param>
/// <returns>The string produced by the parser.</returns>
public string ParseFileToString(FileInfo fileInfo)
{
    string fullPath = fileInfo.FullName;
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(fullPath));
    return textParser.Parse();
}
/// <summary>
/// Parsing the "raw text mail demo.msg" sample must yield a non-empty string.
/// </summary>
public void PureTextMsg_ReadTextTest()
{
    string msgPath = TestDataSample.GetEmailPath("raw text mail demo.msg");
    var textParser = ParserFactory.CreateText(new ParserContext(msgPath));

    string body = textParser.Parse();

    Assert.IsNotNullOrEmpty(body);
}
/// <summary>
/// Parsing "Employee.xls" as text must produce output containing both column captions.
/// </summary>
public void TestExcel2003TextParser()
{
    string workbookPath = TestDataSample.GetExcelPath("Employee.xls");
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(workbookPath));

    string extracted = textParser.Parse();

    Assert.IsNotNull(extracted);

    // Each caption must be found at an index greater than zero.
    foreach (string caption in new[] { "Last name", "First name" })
    {
        Assert.IsTrue(extracted.IndexOf(caption) > 0);
    }
}
/// <summary>
/// Parsing "utf8.txt" must return its four lines, each terminated by
/// <see cref="Environment.NewLine"/>.
/// </summary>
public void TestReadWholeText()
{
    ParserContext context = new ParserContext(TestDataSample.GetTextPath("utf8.txt"));
    ITextParser textParser = ParserFactory.CreateText(context);

    string actual = textParser.Parse();

    // The trailing empty element makes Join emit the final line terminator.
    string expected = string.Join(
        Environment.NewLine,
        new[] { "hello world", "a2", "a3", "bbb4", string.Empty });

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Parsing "toxy.zip" as text must yield 68 non-empty lines.
/// </summary>
public void TestParseDirectoryFromZip()
{
    string archivePath = TestDataSample.GetFilePath("toxy.zip", null);
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(archivePath));

    string listing = textParser.Parse();
    Assert.IsNotNull(listing);

    string[] separators = { Environment.NewLine };
    string[] entries = listing.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    Assert.AreEqual(68, entries.Length);
}
/// <summary>
/// With the "IncludeSheetNames" property set to "0", the text extracted from
/// "WithVariousData.xlsx" must not contain "Sheet1".
/// </summary>
public void TestExcel2007TextParserWithoutSheetNames()
{
    string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ParserContext context = new ParserContext(workbookPath);
    context.Properties.Add("IncludeSheetNames", "0");

    string extracted = ParserFactory.CreateText(context).Parse();

    Assert.IsNotNull(extracted);
    int sheetNameIndex = extracted.IndexOf("Sheet1");
    Assert.IsTrue(sheetNameIndex < 0);
}
/// <summary>
/// With the "IncludeHeaderFooter" property set to "1", the text extracted from
/// "WithVariousData.xlsx" must contain the header text.
/// </summary>
public void TestExcel2007TextParserWithHeaderFooter()
{
    string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ParserContext context = new ParserContext(workbookPath);
    context.Properties.Add("IncludeHeaderFooter", "1");

    string extracted = ParserFactory.CreateText(context).Parse();

    Assert.IsNotNull(extracted);
    int headerIndex = extracted.IndexOf("This is the header");
    Assert.IsTrue(headerIndex > 0);
}
/// <summary>
/// Parsing "testPPT.pptx" as text must yield 14 non-empty CRLF-separated entries,
/// with known values at positions 0, 1, 4 and 7.
/// </summary>
public void ReadTextBasicTest()
{
    string relativePath = TestDataSample.GetPowerpointPath("testPPT.pptx");
    ParserContext context = new ParserContext(Path.GetFullPath(relativePath));
    ITextParser textParser = ParserFactory.CreateText(context);

    string extracted = textParser.Parse();
    Assert.IsNotNullOrEmpty(extracted);

    string[] crlf = { "\r\n" };
    string[] entries = extracted.Split(crlf, StringSplitOptions.RemoveEmptyEntries);

    Assert.AreEqual(14, entries.Length);
    Assert.AreEqual("Attachment Test", entries[0]);
    Assert.AreEqual("Rajiv", entries[1]);
    Assert.AreEqual("Different words to test against", entries[4]);
    Assert.AreEqual("Hello", entries[7]);
}
/// <summary>
/// With default settings, the text extracted from "WithVariousData.xlsx" must contain
/// the listed cell values and must not contain the header text.
/// </summary>
public void TestExcel2007TextParser()
{
    string workbookPath = TestDataSample.GetExcelPath("WithVariousData.xlsx");
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(workbookPath));

    string extracted = textParser.Parse();
    Assert.IsNotNull(extracted);

    // Every fragment below must be found at an index greater than zero.
    string[] expectedFragments =
    {
        "Foo",
        "Bar",
        "a really long cell",
        "have a header",
        "have a footer"
    };
    foreach (string fragment in expectedFragments)
    {
        Assert.IsTrue(extracted.IndexOf(fragment) > 0);
    }

    // The header text must be absent in this configuration.
    Assert.IsTrue(extracted.IndexOf("This is the header") < 0);
}
/// <summary>
/// Parsing the "Azure pricing and services updates.msg" sample must produce text with
/// the expected [From], [To] and [Subject] entries; [Cc] and [Bcc] must not be found
/// at an index greater than zero.
/// </summary>
public void HtmlMsg_ReadTextTest()
{
    string msgPath = TestDataSample.GetEmailPath("Azure pricing and services updates.msg");
    var textParser = ParserFactory.CreateText(new ParserContext(msgPath));

    string body = textParser.Parse();
    Assert.IsNotNullOrEmpty(body);

    // [From] may sit at index 0; the others must appear later in the text.
    int fromIndex = body.IndexOf("[From] Azure Team<*****@*****.**>");
    Assert.IsTrue(fromIndex >= 0);

    int toIndex = body.IndexOf("[To] [email protected]");
    Assert.IsTrue(toIndex > 0);

    int subjectIndex = body.IndexOf("[Subject] Azure pricing and services updates");
    Assert.IsTrue(subjectIndex > 0);

    int ccIndex = body.IndexOf("[Cc]");
    Assert.IsFalse(ccIndex > 0);

    int bccIndex = body.IndexOf("[Bcc]");
    Assert.IsFalse(bccIndex > 0);
}
/// <summary>
/// Extracts the text of a file and rejects it if it contains any control character
/// other than line feed, tab or carriage return.
/// </summary>
/// <param name="filePath">Path of the file to parse.</param>
/// <param name="extension">Not used by this method.</param>
/// <returns>
/// The extracted text, or <c>null</c> when a disallowed control character is found.
/// The first such character is reported on standard error.
/// </returns>
public string ExtractText(string filePath, string extension)
{
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(filePath));
    string extracted = textParser.Parse();

    for (int index = 0; index < extracted.Length; index++)
    {
        char current = extracted[index];

        bool isAllowedWhitespace = current == '\n' || current == '\t' || current == '\r';
        if (isAllowedWhitespace || !char.IsControl(current))
        {
            continue;
        }

        Console.Error.WriteLine("Found control character: {0} {1}", (int)current, current);
        return null;
    }

    return extracted;
}
/// <summary>
/// Parsing "SampleDoc.docx" as text must yield exactly the six expected non-empty lines.
/// </summary>
public void TestParseTextFromWord()
{
    string documentPath = TestDataSample.GetWordPath("SampleDoc.docx");
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(documentPath));

    string extracted = textParser.Parse();
    Assert.IsNotNull(extracted);

    string[] separators = { Environment.NewLine };
    string[] actualLines = extracted.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    string[] expectedLines =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue"
    };

    Assert.AreEqual(6, actualLines.Length);
    for (int index = 0; index < expectedLines.Length; index++)
    {
        Assert.AreEqual(expectedLines[index], actualLines[index]);
    }
}
/// <summary>
/// Parsing "SampleDoc.doc" as text must yield eight '\r'-separated segments: the
/// document lines, a form-feed segment between the pages, and an empty final segment.
/// </summary>
public void TestParseTextFromWord()
{
    string documentPath = TestDataSample.GetWordPath("SampleDoc.doc");
    ITextParser textParser = ParserFactory.CreateText(new ParserContext(documentPath));

    string extracted = textParser.Parse();
    Assert.IsNotNull(extracted);

    // Empty segments are kept, so the terminator after the last line adds a final "".
    string[] actualSegments = extracted.Split('\r');

    string[] expectedSegments =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "\f",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue",
        ""
    };

    Assert.AreEqual(8, actualSegments.Length);
    for (int index = 0; index < expectedSegments.Length; index++)
    {
        Assert.AreEqual(expectedSegments[index], actualSegments[index]);
    }
}
/// <summary>
/// Extracts the text of a fixed PDF, re-flows it to one sentence per line, then
/// builds a JSON payload with <c>MakeJSON</c> and sends it with <c>MakeRequest</c>.
/// </summary>
static void Main()
{
    string postJson = "";

    ITextParser pdfParser = ParserFactory.CreateText(new ParserContext(@"D:\SelfStudy\PDF1\FAQ_Short.pdf"));

    string sentences = pdfParser.Parse()
        // Remove the original line breaks (http://www.atmarkit.co.jp/ait/articles/1004/08/news094.html).
        .Replace("\r", "")
        .Replace("\n", "")
        // Start a new line after every '.' and '?'.
        .Replace(".", ".\r\n")
        .Replace("?", "?\r\n");

    // Remove lines that consist of a single character (intended: lines holding only '.').
    // See http://baba-s.hatenablog.com/entry/2018/05/18/171500
    // NOTE(review): the unescaped '.' matches any character, not only a literal dot — confirm intent.
    sentences = Regex.Replace(sentences, "^.[\r\n]+", string.Empty, RegexOptions.Multiline);

    // Writing the text to a file is disabled while testing:
    /***
     * StreamWriter sw_out = new StreamWriter(@"D:\SelfStudy\PDF1\UFR.txt", true, Encoding.GetEncoding("shift_jis"));
     * sw_out.Write(sentences);
     * sw_out.Close();
     ***/

    // Build the JSON payload; passed by reference so MakeJSON can fill it in
    // (https://dobon.net/vb/dotnet/beginner/byvalbyref.html).
    MakeJSON(ref postJson);

    MakeRequest(postJson);
}
/// <summary>
/// Extracts text from <paramref name="content"/> by copying it to a file named
/// <paramref name="fileName"/> in the temp directory and running the text parser on it.
/// </summary>
/// <param name="fileName">File name used for the temporary copy and passed to the parser context.</param>
/// <param name="content">Source whose read stream is copied to disk.</param>
/// <param name="cancellationToken">Cancels opening and copying the stream.</param>
/// <returns>
/// The parsed text, or the result of <c>ExtractAsSimpleText</c> when an
/// <see cref="InvalidDataException"/> or <see cref="NotSupportedException"/> is thrown.
/// </returns>
public async Task<string> Extract(string fileName, IContent content, CancellationToken cancellationToken)
{
    await using var stream = await content.OpenReadStream(cancellationToken);
    var path = Path.Combine(Path.GetTempPath(), fileName);

    try
    {
        // The parser is created before the file is written, so a parser-creation
        // failure is raised without touching the disk.
        var toxyParser = ParserFactory.CreateText(new ParserContext(path));

        await using (var fileStream = File.Create(path, 81920, FileOptions.Asynchronous))
        {
            await stream.CopyToAsync(fileStream, cancellationToken);
        }

        return toxyParser.Parse();
    }
    catch (Exception e) when (e is InvalidDataException || e is NotSupportedException)
    {
        // NOTE(review): if the failure came from Parse, the stream has already been read
        // by CopyToAsync — confirm ExtractAsSimpleText copes with that.
        return await ExtractAsSimpleText(stream);
    }
    finally
    {
        // Remove the temporary copy on every exit path, not only on success;
        // previously a failed parse or copy left the file behind.
        if (File.Exists(path))
        {
            File.Delete(path);
        }
    }
}
/// <summary>
/// Command-line entry point: prints the metadata and/or text of one file.
/// Recognised switches: <c>/encoding NAME</c>, <c>/text</c>, <c>/metadata</c>.
/// With neither <c>/text</c> nor <c>/metadata</c>, text is attempted first and
/// metadata is used if no text parser can be created.
/// </summary>
/// <param name="args">Command-line arguments; exactly one file path must remain after the switches.</param>
/// <returns>
/// The number of remaining arguments when it is not exactly one (after printing usage);
/// -1 on an unexpected exception; otherwise the flags of the stages that failed.
/// </returns>
static int Main(string[] args)
{
    try
    {
        Console.OutputEncoding = Encoding.UTF8;

        var encodingName = "UTF-8";
        var requested = Flags.None;
        var failed = Flags.None;
        var remaining = new ArrayList(args);

        // "/encoding" consumes the switch and, if present, the value following it.
        int encodingAt = remaining.IndexOf("/encoding");
        if (encodingAt >= 0)
        {
            remaining.RemoveAt(encodingAt);
            if (encodingAt < remaining.Count)
            {
                encodingName = (string)remaining[encodingAt];
                remaining.RemoveAt(encodingAt);
            }
        }

        int textAt = remaining.IndexOf("/text");
        if (textAt >= 0)
        {
            remaining.RemoveAt(textAt);
            requested |= Flags.Text;
        }

        int metadataAt = remaining.IndexOf("/metadata");
        if (metadataAt >= 0)
        {
            remaining.RemoveAt(metadataAt);
            requested |= Flags.Metadata;
        }

        if (remaining.Count != 1)
        {
            Console.WriteLine(Usage);
            return remaining.Count;
        }

        var filepath = (string)remaining[0];
        ParserContext context = new ParserContext(filepath);
        context.Encoding = Encoding.GetEncoding(encodingName);

        ITextParser textParser = null;
        bool tryText = requested == Flags.None || (requested & Flags.Text) != 0;
        if (tryText)
        {
            try
            {
                textParser = ParserFactory.CreateText(context);
            }
            catch (Exception textCreationError)
            {
                if (requested == Flags.None)
                {
                    // Nothing was asked for explicitly: silently fall back to metadata.
                    requested = Flags.Metadata;
                }
                else
                {
                    failed |= Flags.Text;
                    Console.WriteLine(textCreationError);
                }
            }
        }

        // Evaluated after the fallback above, which may have switched metadata on.
        if ((requested & Flags.Metadata) != 0)
        {
            try
            {
                var metadataParser = ParserFactory.CreateMetadata(context);
                Console.WriteLine("[{0}]", metadataParser.GetType().Name);

                var entries = metadataParser.Parse();
                foreach (var entry in entries)
                {
                    Console.WriteLine("{0} = {1}", entry.Name.PadRight(23), entry.Value.ToString());
                }

                Console.WriteLine();
            }
            catch (Exception metadataError)
            {
                failed |= Flags.Metadata;
                Console.WriteLine(metadataError);
            }
        }

        if (textParser != null)
        {
            try
            {
                // The parser name is printed only when /text was given explicitly.
                if ((requested & Flags.Text) != 0)
                {
                    Console.WriteLine("[{0}]", textParser.GetType().Name);
                }

                var content = textParser.Parse();

                // A trailing '\r' seems to happen with .doc files; convert to '\n'.
                if (content.EndsWith("\r"))
                {
                    content = content.Replace('\r', '\n');
                }

                Console.Write(content);
            }
            catch (Exception textError)
            {
                failed |= Flags.Text;
                Console.WriteLine(textError);
            }
        }

        return (int)failed;
    }
    catch (Exception unexpected)
    {
        Console.Write(unexpected);
        return -1;
    }
}
/// <summary>
/// Parses the file and shows it in the panel matching the current <c>Mode</c>:
/// plain text, a structured view chosen by <paramref name="extension"/>, or metadata.
/// </summary>
/// <param name="filepath">Path of the file to parse.</param>
/// <param name="encoding">Name of the encoding assigned to the parser context.</param>
/// <param name="extension">File extension including the dot; selects the structured view.</param>
private void ShowDocument(string filepath, string encoding, string extension)
{
    ParserContext context = new ParserContext(filepath);
    context.Encoding = Encoding.GetEncoding(encoding);

    if (Mode == ViewMode.Text)
    {
        AppendRichTextBox();
        var textParser = ParserFactory.CreateText(context);
        rtbPanel.Text = textParser.Parse();
        tbParserType.Text = textParser.GetType().Name;
        return;
    }

    if (Mode != ViewMode.Structured)
    {
        // Any other mode shows the metadata as a property list.
        AppendPropertyListPanel();
        var metadataParser = ParserFactory.CreateMetadata(context);
        ToxyMetadata entries = metadataParser.Parse();
        plPanel.Clear();
        foreach (var entry in entries)
        {
            plPanel.AddItem(entry.Name, entry.Value.ToString());
        }

        tbParserType.Text = metadataParser.GetType().Name;
        return;
    }

    switch (extension)
    {
        case ".csv":
        case ".xlsx":
        case ".xls":
            AppendSpreadsheetGrid();
            if (extension == ".csv")
            {
                // Only CSV input gets the "HasHeader" property.
                context.Properties.Add("HasHeader", "1");
            }

            ISpreadsheetParser sheetParser = ParserFactory.CreateSpreadsheet(context);
            ss = sheetParser.Parse();
            tbParserType.Text = sheetParser.GetType().Name;

            // Show the first table and list every table name in the sheet selector.
            ShowToGrid(ss.Tables[0]);
            cbSheets.Items.Clear();
            foreach (var sheet in ss.Tables)
            {
                cbSheets.Items.Add(sheet.Name);
            }

            cbSheets.SelectedIndex = 0;
            panel1.Visible = true;
            break;

        case ".vcf":
            AppendDataGridView();
            var cardParser = ParserFactory.CreateVCard(context);
            ToxyBusinessCards cards = cardParser.Parse();
            tbParserType.Text = cardParser.GetType().Name;
            gridPanel.GridView.DataSource = cards.ToDataTable().DefaultView;
            break;

        case ".pptx":
            // TODO: show slides
            break;

        case ".xml":
        case ".htm":
        case ".html":
            AppendTreePanel();
            var domParser = ParserFactory.CreateDom(context);
            ToxyDom dom = domParser.Parse();
            TreeNode root = treePanel.Tree.Nodes.Add(dom.Root.NodeString);

            // The child nodes are added between BeginUpdate and EndUpdate.
            treePanel.Tree.BeginUpdate();
            AppendTree(root, dom.Root);
            treePanel.Tree.EndUpdate();
            // The tree is left collapsed (an ExpandAll call is disabled here).
            break;
    }
}
/// <summary>
/// Extracts the text of one file, tokenizes it, and adds the stemmed words that are
/// neither stop words nor contain digits to the inverted index. The file is then
/// registered under the current document id, which is incremented afterwards.
/// </summary>
/// <param name="file">File to index; skipped if it does not exist or its extension is not in <c>extensions</c>.</param>
private void ProcessFile(FileInfo file)
{
    if (!file.Exists || !extensions.Contains(file.Extension))
    {
        return;
    }

    // Characters that split the text into words; \u2022 is the Unicode bullet symbol.
    char[] separators =
    {
        ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.', '\r',
        '\n', '|'
    };

    try
    {
        string document;
        ITextParser parser;

        switch (file.Extension)
        {
            case ".html":
            case ".htm":
            case ".xml":
                // Markup files: extract the text, then strip the tags.
                parser = ParserFactory.CreateText(new ParserContext(file.FullName));
                document = RemoveAllTags(parser.Parse());
                break;

            case ".pptx":
                document = ExtractPptxText(file);
                break;

            default:
                // Everything else goes straight through the text parser.
                parser = ParserFactory.CreateText(new ParserContext(file.FullName));
                document = parser.Parse();
                break;
        }

        // Lower-case, split on the separators and drop empty tokens.
        string[] tokens = document.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string token in tokens)
        {
            // Skip stop words and any token containing a digit.
            bool skip = stopwords.Contains(token) || Regex.IsMatch(token, "\\d+");
            if (!skip)
            {
                // Words are stemmed before being added to the inverted index.
                string stem = stemmer.StemWord(token.Trim());
                InvertedIndex.GetInstance().Add(stem, new InvertedIndex.Tuple(docId, wordPosition++));
            }
        }
    }
    catch (Exception ex) when (ex is IOException || ex is NullReferenceException || ex is ZipException)
    {
        MessageBox.Show(@"Please close all programs using the files you want to search.");
    }
    catch (Exception ex) when (ex is InvalidDataException)
    {
        MessageBox.Show(@"Invalid file format.");
    }

    // The file is registered and the id advanced even when extraction failed above.
    FileMatch.GetInstance().Add(docId, file);
    docId++;
}