/// <summary>
/// Prompts the user for a PDF file and dumps the rounded bounding box and
/// content of every text string extracted from each page.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string filePath = PromptFileChoice("Please select a PDF file");
    using (File file = new File(filePath))
    {
        // 2. Text extraction from the document pages.
        var extractor = new TextExtractor();
        foreach (Page page in file.Document.Pages)
        {
            if (!PromptNextPage(page, false))
            {
                Quit();
                break;
            }

            // DefaultArea covers the whole page.
            foreach (ITextString textString in extractor.Extract(page)[TextExtractor.DefaultArea])
            {
                RectangleF box = textString.Box.Value;
                Console.WriteLine(
                    "Text ["
                    + "x:" + Math.Round(box.X) + ","
                    + "y:" + Math.Round(box.Y) + ","
                    + "w:" + Math.Round(box.Width) + ","
                    + "h:" + Math.Round(box.Height)
                    + "]: " + textString.Text
                    );
            }
        }
    }
}
/// <summary>
/// Prompts for a PDF file and a regular expression, highlights every match
/// on each page, then serializes the annotated file.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string filePath = PromptFileChoice("Please select a PDF file");
    using (File file = new File(filePath))
    {
        // Define the text pattern to look for!
        string textRegEx = PromptChoice("Please enter the pattern to look for: ");
        var pattern = new Regex(textRegEx, RegexOptions.IgnoreCase);

        // 2. Iterating through the document pages...
        var highlighter = new TextExtractor(true, true);
        foreach (Page page in file.Document.Pages)
        {
            Console.WriteLine("\nScanning page " + (page.Index + 1) + "...\n");

            // 2.1. Extract the page text!
            IDictionary<RectangleF?, IList<ITextString>> pageText = highlighter.Extract(page);

            // 2.2. Find the text pattern matches!
            MatchCollection matches = pattern.Matches(TextExtractor.ToString(pageText));

            // 2.3. Highlight the text pattern matches!
            highlighter.Filter(pageText, new TextHighlighter(page, matches));
        }

        // 3. Highlighted file serialization.
        Serialize(file);
    }
}
/// <summary>
/// Extracts content and metadata from the instance's <c>file</c> via the Tika
/// extractor and maps the result onto a <see cref="Files"/> record.
/// </summary>
/// <returns>A populated <see cref="Files"/> instance.</returns>
public Files tika_metadata()
{
    var result = _cut.Extract(file);
    var files = new Files()
    {
        Id = Path.GetFileName(file),
        ContentType = result.ContentType,
        Body = result.Text,
        FilePath = Path.GetDirectoryName(file),
        CreatedDate = File.GetCreationTime(file),
        ModifiedDate = File.GetLastWriteTime(file)
    };

    // Single-lookup access via TryGetValue instead of ContainsKey + indexer.
    string title;
    files.Title = result.Metadata.TryGetValue("title", out title) ? title : file;

    string contentLength;
    long parsedLength;
    if (result.Metadata.TryGetValue("Content-Length", out contentLength)
        && long.TryParse(contentLength, out parsedLength))
    {
        files.ContentLength = parsedLength;
    }

    string author;
    if (result.Metadata.TryGetValue("Author", out author))
    {
        files.Author = author;
    }

    return files;
}
/// <summary>
/// Walks each page of a user-chosen PDF and prints the rounded bounding box
/// and text of every extracted string.
/// </summary>
public override void Run()
{
    // 1. Opening the PDF file...
    string filePath = PromptFileChoice("Please select a PDF file");
    using (var file = new File(filePath))
    {
        Document document = file.Document;

        // 2. Text extraction from the document pages.
        var extractor = new TextExtractor();
        foreach (Page page in document.Pages)
        {
            if (!PromptNextPage(page, false))
            {
                Quit();
                break;
            }

            IList<ITextString> pageStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
            foreach (ITextString item in pageStrings)
            {
                RectangleF box = item.Box.Value;
                // Round coordinates to whole points for readability.
                Console.WriteLine(
                    $"Text [x:{Math.Round(box.X)},y:{Math.Round(box.Y)},w:{Math.Round(box.Width)},h:{Math.Round(box.Height)}]: {item.Text}");
            }
        }
    }
}
/// <summary>
/// Extract must reject a null byte array with <see cref="ArgumentNullException"/>.
/// </summary>
public void TestExtractMethodWithUndefinedBytes()
{
    byte[] rawData = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(rawData));
}
/// <summary>
/// Extracts data from raw bytes asynchronously by off-loading the synchronous
/// extraction call to the thread pool.
/// </summary>
/// <param name="extractor">The extractor.</param>
/// <param name="rawData">The raw data.</param>
/// <returns>A task producing the extraction result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="extractor"/> is null.</exception>
public static Task<TextExtractionResult> ExtractAsync(this TextExtractor extractor, byte[] rawData)
{
    // Fail fast on a null receiver instead of surfacing a NullReferenceException
    // inside the returned task.
    if (extractor == null)
    {
        throw new ArgumentNullException(nameof(extractor));
    }
    return Task.Run(() => extractor.Extract(rawData));
}
/// <summary>
/// Extract must reject a null stream with <see cref="ArgumentNullException"/>.
/// </summary>
public void TestExtractMethodWithUndefinedStream()
{
    Stream input = null;

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(input));
}
/// <summary>
/// Receives an uploaded file, stores it under ~/Files with a tick-based unique
/// name and, for non-PDF documents, extracts text/metadata and indexes them
/// with Lucene.
/// </summary>
/// <returns>The default view.</returns>
public ActionResult Upload()
{
    if (Request.Files.Count > 0)
    {
        var file = Request.Files[0];
        if (file != null && file.ContentLength > 0)
        {
            var fileName = Path.GetFileName(file.FileName);
            // Path.GetExtension is robust against names with multiple dots or
            // no extension (the old Split('.')[1] broke on both).
            var extension = Path.GetExtension(fileName).TrimStart('.');
            var myUniqueFileName = string.Format(@"{0}.{1}", DateTime.Now.Ticks, extension);
            var path = Path.Combine(Server.MapPath("~/Files/"), myUniqueFileName); // TODO: make target path configurable
            file.SaveAs(path);

            string author = "";
            string title = "";
            string description = "";
            if (!string.Equals(extension, "pdf", StringComparison.OrdinalIgnoreCase))
            {
                var textExtractor = new TextExtractor();
                var wordDocContents = textExtractor.Extract(path);
                // TryGetValue nulls the out parameter on a miss, so restore defaults.
                if (!wordDocContents.Metadata.TryGetValue("Author", out author)) { author = ""; }
                if (!wordDocContents.Metadata.TryGetValue("title", out title)) { title = ""; }
                if (!wordDocContents.Metadata.TryGetValue("description", out description)) { description = ""; }

                Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
                Lucene.Net.Store.Directory directory = FSDirectory.Open(@"C:\Users\Totalit\Source\Repos\FilesAnalyzer\FilesAnalyzer\FilesAnalyzer\IndexedFiles"); // TODO: configurable index location
                IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    var doc = new Document();
                    doc.Add(new Field("Author", author, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("description", description, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("text", wordDocContents.Text, Field.Store.YES, Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                    writer.Optimize();
                }
                finally
                {
                    // Always release the index write lock, even when indexing throws.
                    writer.Close();
                }
            }
            else
            {
                // TO DO html version
            }
        }
    }
    return View();
}
/// <summary>
/// Prompts for a PDF file and, for each page, lists every link annotation
/// together with the text it overlays, its position and its target.
/// </summary>
public override void Run( )
{
    // 1. Opening the PDF file...
    string filePath = PromptFileChoice("Please select a PDF file");
    using (files::File file = new files::File(filePath))
    {
        Document document = file.Document;
        // 2. Link extraction from the document pages.
        TextExtractor extractor = new TextExtractor();
        extractor.AreaTolerance = 2; // 2 pt tolerance on area boundary detection.
        bool linkFound = false;
        foreach (Page page in document.Pages)
        {
            if (!PromptNextPage(page, !linkFound))
            {
                Quit();
                break;
            }
            // Page text is extracted lazily: only once the page proves to have a link.
            IDictionary <RectangleF?, IList <ITextString> > textStrings = null;
            linkFound = false;
            // Get the page annotations!
            PageAnnotations annotations = page.Annotations;
            if (!annotations.Exists())
            {
                Console.WriteLine("No annotations here.");
                continue;
            }
            // Iterating through the page annotations looking for links...
            foreach (Annotation annotation in annotations)
            {
                if (annotation is Link)
                {
                    linkFound = true;
                    if (textStrings == null)
                    {
                        textStrings = extractor.Extract(page);
                    }
                    Link link = (Link)annotation;
                    RectangleF linkBox = link.Box;
                    // Text.
                    /*
                     * Extracting text superimposed by the link...
                     * NOTE: As links have no strong relation to page text but a weak location correspondence,
                     * we have to filter extracted text by link area.
                     */
                    StringBuilder linkTextBuilder = new StringBuilder();
                    foreach (ITextString linkTextString in extractor.Filter(textStrings, linkBox))
                    {
                        linkTextBuilder.Append(linkTextString.Text);
                    }
                    Console.WriteLine("Link '" + linkTextBuilder + "' ");
                    // Position (rounded to whole points).
                    Console.WriteLine(
                        " Position: "
                        + "x:" + Math.Round(linkBox.X) + ","
                        + "y:" + Math.Round(linkBox.Y) + ","
                        + "w:" + Math.Round(linkBox.Width) + ","
                        + "h:" + Math.Round(linkBox.Height)
                        );
                    // Target: either an in-document destination or an action.
                    Console.Write(" Target: ");
                    PdfObjectWrapper target = link.Target;
                    if (target is Destination)
                    {
                        PrintDestination((Destination)target);
                    }
                    else if (target is actions::Action)
                    {
                        PrintAction((actions::Action)target);
                    }
                    else if (target == null)
                    {
                        Console.WriteLine("[not available]");
                    }
                    else
                    {
                        Console.WriteLine("[unknown type: " + target.GetType().Name + "]");
                    }
                }
            }
            if (!linkFound)
            {
                Console.WriteLine("No links here.");
                continue;
            }
        }
    }
}
/// <summary>
/// MP4 files yield no text, but the content type must still be detected and
/// the source file must not be left locked by the extractor afterwards.
/// </summary>
public void should_extract_mp4()
{
    var textExtractionResult = _cut.Extract("files/badgers.mp4");

    textExtractionResult.ContentType.ShouldEqual("video/mp4");

    // Resolve the fixture relative to the test output directory rather than a
    // hard-coded developer-machine path, then prove the file handle was released.
    var fileInfo = new FileInfo(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "files", "badgers.mp4"));
    fileInfo.Delete();
    fileInfo.Exists.ShouldBeFalse();
}
/// <summary>
/// An empty byte array carries no detectable format, so extraction must fail
/// with <see cref="NotSupportedException"/>.
/// </summary>
public void TestExtractMethodWithMissingBytes()
{
    var emptyBytes = new byte[0];

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(emptyBytes));
}
/// <summary>
/// Extracting a non-existent file must raise TextExtractionException whose
/// message names the offending path.
/// </summary>
public void non_existing_files_should_fail_with_exception()
{
    const string missingPath = "files/doesnotexist.mp3";

    Action act = () => _cut.Extract(missingPath);

    act.ShouldThrow<TextExtractionException>()
        .Which.Message.Should().Contain(missingPath);
}
/// <summary>
/// Runs the extractor against a CDN address.
/// </summary>
/// <param name="cdn">Absolute URL of the resource to extract.</param>
/// <returns>The extraction result.</returns>
private TextExtractionResult ExtractFromCdn(string cdn)
{
    return _cut.Extract(new Uri(cdn));
}
/// <summary>
/// Entry point: extracts the text of each page of the PDF named by the first
/// command-line argument.
/// </summary>
/// <param name="args">args[0] = path of the PDF file to process.</param>
static void Main(string[] args)
{
    // Guard against a missing argument instead of crashing with IndexOutOfRangeException.
    if (args == null || args.Length == 0)
    {
        Console.WriteLine("Usage: <program> <pdf-file>");
        return;
    }
    // Page number -> page text.
    Dictionary<int, string> pdfPages = TextExtractor.Extract(args[0]);
}
/// <summary>
/// A document using whitespace-preserve must round-trip its text intact.
/// </summary>
public void TestExtractMethodWithValidFileWithWhitespacePreserveFile()
{
    const string expected = "This is a Word document.";

    var actual = TextExtractor.Extract(@"Documents\ValidWithWhitespacePreserve.docx");

    Assert.IsTrue(expected == actual);
}
/// <summary>
/// A corrupt .docx must be rejected with <see cref="NotSupportedException"/>.
/// </summary>
public void TestExtractMethodWithInvalidFile()
{
    const string invalidPath = @"Documents\Invalid.docx";

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(invalidPath));
}
/// <summary>
/// An empty path resolves to no file, so extraction must fail with
/// <see cref="FileNotFoundException"/>.
/// </summary>
public void TestExtractMethodWithMissingFile()
{
    var missingPath = string.Empty;

    Assert.ThrowsException<FileNotFoundException>(() => TextExtractor.Extract(missingPath));
}
/// <summary>
/// Inserts page-reference anchors ("a" elements with type="pageref") into the
/// HFCC document: for each section it finds the PDF pages whose text contains
/// the section's footer key, takes the first line after the last footer
/// occurrence, and attaches the page number to the XML element containing
/// that line.
/// </summary>
/// <param name="hfccFilePath">Path of the HFCC XML input file.</param>
/// <param name="pdfFilePath">Path of the PDF whose page texts are scanned.</param>
/// <param name="footerPrefix">Prefix prepended to the section key to build the footer query.</param>
/// <returns>The (possibly modified) HFCC XML document.</returns>
public XDocument FixPageRef(string hfccFilePath, string pdfFilePath, string footerPrefix)
{
    var hfcc = new InputData(hfccFilePath);
    IEnumerable <XElement> sections = SplitSection(hfcc);
    if (sections.Any())
    {
        // Page number -> full page text.
        Dictionary <int, string> pdfPages = TextExtractor.Extract(pdfFilePath);
        // Pages that received at least one anchor.
        var collected = new HashSet <int>();
        foreach (XElement section in sections)
        {
            // NOTE(review): Dutch schema — "kenmerkgrp/kenmerk" presumably holds
            // the section key; verify against the HFCC schema definition.
            XElement kenmerkgrp = section.Element("kenmerkgrp");
            if (kenmerkgrp == null)
            {
                continue;
            }
            XElement kenmerk = kenmerkgrp.Element("kenmerk");
            if (kenmerk == null)
            {
                continue;
            }
            XElement commentaarcontent = section.Element("commentaarcontent");
            if (commentaarcontent == null)
            {
                continue;
            }
            XElement verhandelingalgemeen = commentaarcontent.Element("verhandelingalgemeen");
            if (verhandelingalgemeen == null)
            {
                continue;
            }
            // Footer text to look for on the PDF pages.
            string query = footerPrefix + kenmerk.Value.Trim();
            // Pages whose text contains the query.
            var pagesNo = new Dictionary <int, string>();
            foreach (var pdfPage in pdfPages)
            {
                if (pdfPage.Value.Contains(query))
                {
                    pagesNo[pdfPage.Key] = pdfPage.Value;
                }
            }
            XElement samenvatting = verhandelingalgemeen.Element("samenvatting");
            foreach (var pageNo in pagesNo)
            {
                string[] lines = pageNo.Value.Split('\n');
                // Walk backwards to the last line containing the footer query.
                int lastLinesHasBtwBrief = lines.Count() - 1;
                while (lastLinesHasBtwBrief > -1)
                {
                    if (lines[lastLinesHasBtwBrief].Contains(query))
                    {
                        break;
                    }
                    lastLinesHasBtwBrief--;
                }
                // The footer is the very last line: nothing follows it on this page.
                if (lastLinesHasBtwBrief + 1 == lines.Count())
                {
                    continue;
                }
                // Normalize the first line after the footer for fuzzy matching:
                // strip trailing hyphenation, drop spaces, lower-case.
                string firstSentence = lines[lastLinesHasBtwBrief + 1].Trim().TrimEnd('-').Replace(" ", "").ToLower();
                // Anchor placement priority: summary element, then top-level
                // "ablok" blocks, then "ablok" blocks nested under "p".
                if (samenvatting != null && ValueOfElement(samenvatting).Contains(firstSentence))
                {
                    samenvatting.Add(new XElement("a", new XAttribute("type", "pageref"), pageNo.Key));
                    collected.Add(pageNo.Key);
                }
                else
                {
                    bool found = false;
                    IEnumerable <XElement> commentaarcontentabloks = verhandelingalgemeen.Elements("ablok");
                    foreach (XElement commentaarcontentablok in commentaarcontentabloks)
                    {
                        if (ValueOfElement(commentaarcontentablok).Contains(firstSentence))
                        {
                            commentaarcontentablok.Add(new XElement("a", new XAttribute("type", "pageref"), pageNo.Key));
                            collected.Add(pageNo.Key);
                            found = true;
                            break;
                        }
                    }
                    if (!found)
                    {
                        IEnumerable <XElement> ps = verhandelingalgemeen.Elements("p");
                        bool b = false;
                        foreach (XElement p in ps)
                        {
                            IEnumerable <XElement> pAbloks = p.Descendants().Where(e => "ablok".Equals(e.Name.LocalName));
                            foreach (XElement pAblok in pAbloks)
                            {
                                if (ValueOfElement(pAblok).Contains(firstSentence))
                                {
                                    pAblok.Add(new XElement("a", new XAttribute("type", "pageref"), pageNo.Key));
                                    collected.Add(pageNo.Key);
                                    b = true;
                                    break;
                                }
                            }
                            if (b)
                            {
                                break;
                            }
                        }
                    }
                }
            }
        }
        // Report any pages that never received an anchor.
        IEnumerable <int> outer = pdfPages.Keys.Where(k => !collected.Contains(k));
        if (outer.Any())
        {
            Console.WriteLine("Couldn't add page ref: " + string.Join(", ", outer));
            Debug.WriteLine("Couldn't add page ref: " + string.Join(", ", outer));
        }
    }
    return(hfcc.Document);
}
/// <summary>
/// Legacy parser: scans the PDF at <c>filePath</c> for the account section and
/// converts each transaction line into a <see cref="Transaction"/>.
/// Lines are classified by their trailing numeric columns:
/// amount+balance, amount only, or a continuation of the previous description.
/// </summary>
/// <param name="startPage">1-based first page (currently only validated, not used to skip pages).</param>
/// <param name="endPage">1-based last page; must be greater than <paramref name="startPage"/>.</param>
/// <param name="usePageNum">Legacy switch; the page-number code path is commented out.</param>
/// <returns>The parsed transactions.</returns>
/// <exception cref="Exception">endPage &lt;= startPage.</exception>
public List <Transaction> RunOld(int startPage, int endPage, bool usePageNum = false)
{
    if (endPage <= startPage)
    {
        throw new Exception("End page <= Start page.");
    }
    // NOTE(review): startIdx/cnt/textStrings are computed but never used below —
    // leftovers of the commented-out page-number path.
    var startIdx = startPage - 1;
    var cnt = endPage - startPage;
    var textStrings = new List <string>();
    var transactions = new List <Transaction>();
    // 1. Opening the PDF file...
    using (File file = new File(filePath))
    {
        var document = file.Document;
        //if (usePageNum)
        //{
        //if (endPage >= document.Pages.Count)
        //{
        // throw new Exception("Endpage > total pages.");
        // }
        //return UsePageNum(startIdx, cnt, document);
        // }
        // 2. Text extraction from the document pages.
        var extractor = new TextExtractor();
        var foundAccount = false;       // inside the account's section of the statement
        var start = false;              // past a column-header flag, i.e. inside transaction rows
        var pgCnt = 0;                  // 1-based page counter
        var lastAccntPage = -1;         // last page where the account was seen (pages must be consecutive)
        foreach (var page in document.Pages)
        {
            pgCnt++;
            //if (!PromptNextPage(page, false))
            //{
            // break;
            //}
            IList <ITextString> pageTextStrings = new List <ITextString>();
            try
            {
                pageTextStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
            }
            catch
            {
                // Best effort: unreadable pages are skipped silently.
                continue;
            }
            if (pageTextStrings == null || pageTextStrings.Count == 0)
            {
                continue;
            }
            // Skip pages until the account number appears; after that, only
            // accept the page immediately following the previous account page.
            if (lastAccntPage == -1 && !pageTextStrings.Any(x => x.Text.Contains(account)))
            {
                continue;
            }
            if (lastAccntPage > -1 && pgCnt - 1 != lastAccntPage)
            {
                continue;
            }
            lastAccntPage = pgCnt;
            Transaction transactionNowAndPrevious = null;
            foreach (ITextString textString in pageTextStrings)
            {
                if (textString.Text.Contains(account))
                {
                    foundAccount = true;
                    continue;
                }
                if (textString.Text.Contains(eof))
                {
                    // End-of-section marker: stop scanning this page.
                    foundAccount = false;
                    break;
                }
                if (!foundAccount)
                {
                    continue;
                }
                // Any of the three flags marks the start of transaction rows.
                if (textString.Text.Contains(flag))
                {
                    start = true;
                    continue;
                }
                if (textString.Text.Contains(flag1))
                {
                    start = true;
                    continue;
                }
                if (textString.Text.Contains(flag2))
                {
                    start = true;
                    continue;
                }
                if (start && textString.Text.Contains("Beginning"))
                {
                    continue;
                }
                var line = textString.Text.Trim().Split(' ');
                long test;
                // A lone integer line looks like a page number: leave row mode.
                if (line.Length == 1 && !string.IsNullOrEmpty(line[0]) && long.TryParse(line[0], out test))
                {
                    start = false;
                }
                if (!start)
                {
                    continue;
                }
                decimal amnt = 0;
                decimal balance = 0;
                var dateTest = DateTime.MinValue;
                // Case 1: "...<amount> <balance>" — full transaction row.
                if (line.Length > 1 && decimal.TryParse(line[line.Length - 2], out amnt) && decimal.TryParse(line[line.Length - 1], out balance))
                {
                    var desc = line.ToList();
                    DateTime.TryParse(line[0], out dateTest);
                    // Drop balance, amount and date; the rest is the description.
                    desc.RemoveAt(line.Length - 1);
                    desc.RemoveAt(line.Length - 2);
                    desc.RemoveAt(0);
                    desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();
                    transactionNowAndPrevious = new Transaction
                    {
                        Amount = amnt,
                        Date = dateTest,
                        Description = string.Join(" ", desc),
                        Misc = textString.Box.ToString(),
                        // NOTE(review): type inferred from horizontal position of the
                        // first virtual char (< 120 pt => deposit column) — layout-specific.
                        Type = textString.TextChars.First(x => x.Virtual == true).Box.Width < 120 ? TransactionType.Deposit : TransactionType.Withdrawal
                    };
                }
                // Case 2: "<date> ... <amount>" — row without a balance column.
                else if (line.Length > 0 && decimal.TryParse(line[line.Length - 1], out amnt) && DateTime.TryParse(line[0], out dateTest))
                {
                    // No balance
                    var desc = line.ToList();
                    desc.RemoveAt(line.Length - 1);
                    desc.RemoveAt(0);
                    desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();
                    transactionNowAndPrevious = new Transaction
                    {
                        Amount = amnt,
                        Date = dateTest,
                        Description = string.Join(" ", desc),
                        Misc = textString.Box.ToString(),
                        Type = textString.TextChars.First(x => x.Virtual == true).Box.Width < 120 ? TransactionType.Deposit : TransactionType.Withdrawal
                    };
                }
                // Case 3: continuation line — append to the pending transaction
                // and commit it to the result list.
                else
                {
                    transactionNowAndPrevious = transactionNowAndPrevious ?? new Transaction();
                    transactionNowAndPrevious.Description += $" {textString.Text}";
                    transactions.Add(transactionNowAndPrevious);
                }
            }
        }
    }
    return(transactions);
}
/// <summary>
/// Adds words from the specified document to the inverted index table.
/// </summary>
/// <param name="doc">The document to be tokenized.</param>
/// <exception cref="TextExtractionException">Could not extract files from the document.</exception>
public static void AddFileFrom(Document doc)
{
    // Extract the raw text at the document's address, split it into words,
    // then register the document under those words.
    var extractedText = x.Extract(doc.Address).Text;
    String[] words = Semanter.Splitwords(extractedText);
    invt.AddDocument(words, doc);
}
/// <summary>
/// An empty stream carries no detectable format, so extraction must fail
/// with <see cref="NotSupportedException"/>.
/// </summary>
public void TestExtractMethodWithMissingStream()
{
    var emptyStream = new MemoryStream();

    Assert.ThrowsException<NotSupportedException>(() => TextExtractor.Extract(emptyStream));
}
/// <summary>
/// Indexes a file after converting it to text with the configured extractor.
/// Supported extractor kinds:
/// - Apache Tika
/// - IFilter
/// </summary>
/// <param name="path">Full path of the file to index.</param>
/// <param name="indexWriter">Lucene writer receiving the document.</param>
/// <param name="threadName">Caller thread label used as a log prefix.</param>
/// <param name="docDic">Known documents keyed by path (may be null); used for skip/update detection.</param>
/// <returns>true if the file was (re-)indexed; false if it was skipped.</returns>
private bool AddDocument(string path, IndexWriter indexWriter, string threadName, Dictionary <string, DocInfo> docDic)
{
    string filename = System.IO.Path.GetFileName(path);
    string extension = System.IO.Path.GetExtension(path);
    FileInfo fi = new FileInfo(path);
    if (extension == "" || !_targetExtensionDic.ContainsKey(extension.ToLower()))
    {
        // No extension, or extension not in the target list.
        AppObject.Logger.Info(threadName + ":" + "Out of target extension. Skipped: " + path);
        Interlocked.Increment(ref _skippedCount);
        return(false);
    }
    if (extension.ToLower() != ".mp4" && fi.Length > this.FileSizeLimit)
    {
        // File exceeds the size limit (mp4 is exempt).
        AppObject.Logger.Info(threadName + ":" + "File size over. Skipped: " + path);
        Interlocked.Increment(ref _skippedCount);
        return(false);
    }
    // Is this document already known to the index?
    if (docDic != null && docDic.ContainsKey(path))
    {
        DocInfo di = docDic[path];
        di.Exists = true;
        docDic[path] = di;
        // Last-modified check (compared at 1-second resolution).
        if (di.UpdateDate < DateTimeUtil.Truncate(fi.LastWriteTime, TimeSpan.FromSeconds(1)))
        {
            // Updated: delete the stale entry, then re-insert below.
            Term t = new Term(LuceneIndexBuilder.Path, di.Path);
            indexWriter.DeleteDocuments(t);
        }
        else
        {
            // Not updated: nothing to do.
            AppObject.Logger.Info(threadName + ":" + "No updated. Skipped: " + path);
            Interlocked.Increment(ref _skippedCount);
            return(false);
        }
    }
    // Build and add the Lucene document.
    Document doc = new Document();
    if (extension.ToLower() == ".md")
    {
        // Markdown: read the raw text directly.
        string content = ReadToString(path);
        doc.Add(new Field(Content, content, _hilightFieldType));
    }
    else if (extension.ToLower() == ".txt")
    {
        // Plain text: read Shift_JIS files directly; anything else goes
        // through the configured extractor.
        var sjis = Encoding.GetEncoding("Shift_JIS");
        if (FileUtil.GetTextEncoding(path) == sjis)
        {
            string content = "";
            using (var reader = new StreamReader(path, sjis))
            {
                content = reader.ReadToEnd();
            }
            doc.Add(new Field(Content, content, _hilightFieldType));
        }
        else
        {
            if (_txtExtractMode == TextExtractModes.Tika)
            {
                var content = _txtExtractor.Extract(path);
                doc.Add(new Field(Content, content.Text, _hilightFieldType));
            }
            else
            {
                doc.Add(new Field(Content, IFilterParser.Parse(path), _hilightFieldType));
            }
        }
    }
    else
    {
        // Every other supported extension: extract via Tika or IFilter.
        if (_txtExtractMode == TextExtractModes.Tika)
        {
            var content = _txtExtractor.Extract(path);
            doc.Add(new Field(Content, content.Text, _hilightFieldType));
        }
        else
        {
            doc.Add(new Field(Content, IFilterParser.Parse(path), _hilightFieldType));
        }
    }
    doc.Add(new StringField(Path, path, FieldStore.YES));
    doc.Add(new StringField(Title, filename.ToLower(), FieldStore.YES));
    doc.Add(new StringField(Extension, extension.ToLower(), FieldStore.YES));
    // NOTE: Lucene has no Date field type, so keep the timestamp as a long (yyyyMMddHHmmss).
    long l = long.Parse(fi.LastWriteTime.ToString("yyyyMMddHHmmss"));
    doc.Add(new LongPoint(UpdateDate, l));
    doc.Add(new StoredField(UpdateDate, l));
    //doc.Add(new StringField(UpdateDate,
    // DateTools.DateToString(_sdf.parse(fi.LastWriteTime.ToString("yyyy/MM/dd")), DateToolsResolution.DAY),
    // FieldStore.YES));
    indexWriter.AddDocument(doc);
    return(true);
}
/// <summary>
/// All authors of a multi-author PDF must appear, in order, under the
/// "meta:author" metadata key.
/// </summary>
public void should_extract_author_list_from_pdf()
{
    var result = _cut.Extract("files/file_author.pdf", CreateCustomResult);

    result.Metadata["meta:author"].Should().ContainInOrder(
        "Bernal, M. A.",
        "deAlmeida, C. E.",
        "Incerti, S.",
        "Champion, C.",
        "Ivanchenko, V.",
        "Francis, Z.");
}
/// <summary>
/// Prompts for a PDF file and, for each page, lists every link annotation
/// together with the text it overlays, its position and its target.
/// </summary>
public override void Run( )
{
    // 1. Opening the PDF file...
    string filePath = PromptFileChoice("Please select a PDF file");
    using(files::File file = new files::File(filePath))
    {
        Document document = file.Document;
        // 2. Link extraction from the document pages.
        TextExtractor extractor = new TextExtractor();
        extractor.AreaTolerance = 2; // 2 pt tolerance on area boundary detection.
        bool linkFound = false;
        foreach(Page page in document.Pages)
        {
            if(!PromptNextPage(page, !linkFound))
            {
                Quit();
                break;
            }
            // Page text is extracted lazily: only once the page proves to have a link.
            IDictionary<RectangleF?,IList<ITextString>> textStrings = null;
            linkFound = false;
            // Get the page annotations!
            PageAnnotations annotations = page.Annotations;
            if(!annotations.Exists())
            {
                Console.WriteLine("No annotations here.");
                continue;
            }
            // Iterating through the page annotations looking for links...
            foreach(Annotation annotation in annotations)
            {
                if(annotation is Link)
                {
                    linkFound = true;
                    if(textStrings == null)
                    {
                        textStrings = extractor.Extract(page);
                    }
                    Link link = (Link)annotation;
                    RectangleF linkBox = link.Box;
                    // Text.
                    /*
                      Extracting text superimposed by the link...
                      NOTE: As links have no strong relation to page text but a weak location correspondence,
                      we have to filter extracted text by link area.
                    */
                    StringBuilder linkTextBuilder = new StringBuilder();
                    foreach(ITextString linkTextString in extractor.Filter(textStrings,linkBox))
                    {
                        linkTextBuilder.Append(linkTextString.Text);
                    }
                    Console.WriteLine("Link '" + linkTextBuilder + "' ");
                    // Position (rounded to whole points).
                    Console.WriteLine(
                        " Position: "
                        + "x:" + Math.Round(linkBox.X) + ","
                        + "y:" + Math.Round(linkBox.Y) + ","
                        + "w:" + Math.Round(linkBox.Width) + ","
                        + "h:" + Math.Round(linkBox.Height)
                        );
                    // Target: either an in-document destination or an action.
                    Console.Write(" Target: ");
                    PdfObjectWrapper target = link.Target;
                    if(target is Destination)
                    {
                        PrintDestination((Destination)target);
                    }
                    else if(target is actions::Action)
                    {
                        PrintAction((actions::Action)target);
                    }
                    else if(target == null)
                    {
                        Console.WriteLine("[not available]");
                    }
                    else
                    {
                        Console.WriteLine("[unknown type: " + target.GetType().Name + "]");
                    }
                }
            }
            if(!linkFound)
            {
                Console.WriteLine("No links here.");
                continue;
            }
        }
    }
}
/// <summary>
/// Reads the PDF at <c>FilePath</c> and lazily yields a <see cref="Client"/>
/// for every text line matching "&lt;id&gt; &lt;name&gt; &lt;discount&gt;%".
/// </summary>
/// <returns>The deserialized clients, streamed page by page.</returns>
/// <exception cref="System.IO.FileNotFoundException">
/// The PDF file does not exist (thrown on first enumeration, since this is an iterator).
/// </exception>
public IEnumerable<Client> Deserialize()
{
    const string matchIdExp = @"[0-9]+";
    const string matchNameExp = @"([^0-9]\w+\s)+";
    const string matchDiscountExp = @"[0-9]+[.][0-9]+[%]";
    Regex idRegex = new Regex(matchIdExp);
    Regex nameRegex = new Regex(matchNameExp);
    Regex discountRegex = new Regex(matchDiscountExp);
    Regex extractRegex = new Regex(@"[0-9]+\s(\w+\s)+[0-9]+[.][0-9]+[%]");

    // The File constructor already throws FileNotFoundException itself; the
    // previous catch-and-rethrow and the dead null check (a constructor never
    // yields null) added nothing. `using` also ensures the file is released.
    using (File file = new File(FilePath))
    {
        Document pdfDocument = file.Document;
        TextExtractor textExtractor = new TextExtractor();
        foreach (Page page in pdfDocument.Pages)
        {
            IList<ITextString> textStrings = textExtractor.Extract(page)[TextExtractor.DefaultArea];
            foreach (ITextString textString in textStrings)
            {
                // The whole line. (ITextString.Text is already a string; no
                // char-by-char StringBuilder copy is needed.)
                string finalString = textString.Text.Trim();

                Match match = extractRegex.Match(finalString);
                if (!match.Success)
                {
                    continue;
                }

                string idString = idRegex.Match(finalString).Value.Trim();
                string nameString = nameRegex.Match(finalString).Value.Trim();
                string discountString = discountRegex.Match(finalString).Value.Trim();

                // Remove the trailing percentage sign from the discount.
                discountString = discountString.Remove(discountString.Length - 1);

                Client client = new Client();
                client.Id = Convert.ToInt32(idString);
                client.Name = nameString;
                client.Discount = (float)Convert.ToDecimal(discountString);
                yield return client;
            }
        }
    }
}
/// <summary>
/// Parses a bank-statement PDF into transactions. Pages are scanned until the
/// account section with a "beginning balance" marker is found; within it,
/// lines are classified by their trailing numeric columns (amount+balance,
/// amount only, or description continuation). Deposit vs. withdrawal is
/// inferred from the horizontal position of the amount's characters.
/// </summary>
/// <param name="document">The already-opened PDF document.</param>
/// <returns>The parsed transactions.</returns>
/// <exception cref="Exception">A statement page offers no recognizable start marker.</exception>
public List <Transaction> Extract(Document document)
{
    var extractor = new TextExtractor();
    var inProgress = false;            // currently inside the account's transaction section
    var transactions = new List <Transaction>();
    var pgCnt = 0;
    var year = 1977;                   // fallback year until the page header reveals one
    foreach (var page in document.Pages)
    {
        pgCnt++;
        IList <ITextString> pageTextStrings = new List <ITextString>();
        try
        {
            pageTextStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
        }
        catch
        {
            // Best effort: unreadable pages are skipped silently.
            continue;
        }
        // Header like "... <year> - ..." — take the token before the dash as the year.
        var headerSplit = pageTextStrings[0].Text.Split(' ').ToList();
        var dashIdx = headerSplit.IndexOf("-");
        if (dashIdx > -1)
        {
            int.TryParse(headerSplit[dashIdx - 1], out year);
        }
        var accountPage = pageTextStrings.Any(x => x.Text.Contains(account));
        var hasBalances = pageTextStrings.Any(x => x.Text.Contains(begining));
        // Enter the section when the account number and beginning-balance
        // marker appear together; stay in it once entered.
        inProgress = (!inProgress && accountPage && hasBalances) || inProgress;
        if (!inProgress)
        {
            continue;
        }
        // The end-of-section marker means this is the last page of the section.
        var hasEof = pageTextStrings.Any(y => y.Text.Contains(eof));
        inProgress = !hasEof;
        // Transaction rows start after the beginning-balance line, or after a
        // continuation flag on follow-up pages.
        var startIdx = -1;
        var contText = pageTextStrings.FirstOrDefault(y => y.Text.Contains(flag1));
        if (hasBalances)
        {
            startIdx = pageTextStrings.IndexOf(pageTextStrings.First(y => y.Text.Contains(begining))) + 1;
        }
        else if (contText != null)
        {
            startIdx = pageTextStrings.IndexOf(contText) + 1;
        }
        else
        {
            throw new Exception("Could not determine where to start!");
        }
        Transaction transactionNowAndPrevious = null;
        for (var i = startIdx; i < pageTextStrings.Count; i++)
        {
            var textString = pageTextStrings[i];
            var textParts = textString.Text.Trim().Split(' ');
            long eofTest;
            if (textParts.Length == 1 && !string.IsNullOrEmpty(textParts[0]) && long.TryParse(textParts[0], out eofTest))
            {
                // end of page
                continue;
            }
            if (textString.Text.Contains(eof))
            {
                break;
            }
            decimal amnt = 0;
            decimal balance = 0;
            DateTime dateTest = DateTime.MinValue;
            // Case 1: "...<amount> <balance>" — full transaction row.
            if (textParts.Length > 1 && textParts[textParts.Length - 2].Contains(".") && decimal.TryParse(textParts[textParts.Length - 2], out amnt) && decimal.TryParse(textParts[textParts.Length - 1], out balance))
            {
                if (textParts[0].Contains("/"))
                {
                    // Dates on the statement omit the year; append the header year.
                    var dateString = $"{textParts[0]}/{year}";
                    DateTime.TryParse(dateString, out dateTest);
                }
                // Drop balance, amount and date tokens; the rest is the description.
                var desc = textParts.ToList();
                desc.RemoveAt(textParts.Length - 1);
                desc.RemoveAt(textParts.Length - 2);
                desc.RemoveAt(0);
                desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();
                // NOTE(review): column inferred from the virtual chars' position
                // (right edge < 433 pt => deposit column) — layout-specific.
                var tstChrs = textString.TextChars.Where(y => y.Virtual == true).ToList();
                var tst = tstChrs.Count > 2 ? tstChrs[tstChrs.Count - 3] : tstChrs[0];
                var typ = tst.Box.Right < 433 ? TransactionType.Deposit : TransactionType.Withdrawal;
                // Trailing 4-digit token (optionally preceded by "card") is a card number.
                int cardTst = -1;
                int? card = null;
                if ((desc.Count > 1 && desc[desc.Count - 2].ToLower() == "card" || desc[desc.Count - 1].Trim().Length == 4) && int.TryParse(desc[desc.Count - 1], out cardTst))
                {
                    card = cardTst;
                }
                transactionNowAndPrevious = new Transaction
                {
                    Id = Guid.NewGuid(),
                    Amount = typ == TransactionType.Deposit ? amnt : amnt * -1,
                    Date = dateTest,
                    Description = string.Join(" ", desc),
                    Type = typ,
                    Card = card
                };
            }
            // Case 2: "<date> ... <amount>" — row without a balance column.
            else if (textParts.Length > 0 && decimal.TryParse(textParts[textParts.Length - 1], out amnt) && DateTime.TryParse($"{textParts[0]}/{year}", out dateTest))
            {
                // No balance
                var desc = textParts.ToList();
                desc.RemoveAt(textParts.Length - 1);
                desc.RemoveAt(0);
                desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();
                int cardTst = -1;
                int? card = null;
                if ((desc.Count > 1 && desc[desc.Count - 2].ToLower() == "card" || desc[desc.Count - 1].Trim().Length == 4) && int.TryParse(desc[desc.Count - 1], out cardTst))
                {
                    card = cardTst;
                }
                var tstChrs = textString.TextChars.Where(y => y.Virtual == true).ToList();
                var tst = tstChrs.Count > 1 ? tstChrs[tstChrs.Count - 2] : tstChrs[0];
                var typ = tst.Box.Right < 433 ? TransactionType.Deposit : TransactionType.Withdrawal;
                transactionNowAndPrevious = new Transaction
                {
                    Id = Guid.NewGuid(),
                    Amount = typ == TransactionType.Deposit ? amnt : amnt * -1,
                    Date = dateTest,
                    Description = string.Join(" ", desc),
                    Type = typ,
                    Card = card
                };
            }
            // Case 3: continuation line — append to the pending transaction
            // and commit it to the result list.
            else
            {
                int cardTst = -1;
                int? card = null;
                var desc = textString.Text.Trim().Split(' ').ToList();
                if ((desc.Count > 1 && desc[desc.Count - 2].ToLower() == "card" || desc[desc.Count - 1].Trim().Length == 4) && int.TryParse(desc[desc.Count - 1], out cardTst))
                {
                    card = cardTst;
                }
                transactionNowAndPrevious = transactionNowAndPrevious ?? new Transaction();
                transactionNowAndPrevious.Card = card;
                transactionNowAndPrevious.Description += $" {textString.Text}";
                transactions.Add(transactionNowAndPrevious);
            }
        }
    }
    return(transactions);
}
/// <summary>
/// Entry point: recreates the Azure Search index, then processes the queued
/// blob batches in parallel — extracting text/metadata from each blob via a
/// SAS URL and uploading one index action per document.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    ServicePointManager.DefaultConnectionLimit = 10000; //(Or More)
    System.Console.WriteLine(String.Format("Total Min: {0}", DateTime.Now.Subtract(stTime).TotalMinutes));
    Console.WriteLine("{0}", "Deleting index...\n");
    DeleteIndexIfExists(SearchClient, SearchIndexName);
    Console.WriteLine("{0}", "Creating index...\n");
    CreateIndex(SearchClient);
    Console.WriteLine("{0}", "Getting file...\n");
    GetFiles();
    ParallelOptions po = new ParallelOptions();
    po.MaxDegreeOfParallelism = Parallelism;
    int docCounter = 0; // shared across workers; updated via Interlocked
    Parallel.ForEach(FilesToProcess, po, fileList =>
    {
        Console.WriteLine(String.Format("Processing batch #{0}", fileList.Key));
        // Per-batch clients: extractor, blob container and search index client.
        var textExtractor = new TextExtractor();
        CloudStorageAccount blobStorageAccount = CloudStorageAccount.Parse(SourceBlobConectionString);
        var blobBlobClient = blobStorageAccount.CreateCloudBlobClient();
        var blobContainer = blobBlobClient.GetContainerReference(SourceBlobContainer);
        var containerUrl = blobContainer.Uri.AbsoluteUri;
        SearchIndexClient indexClient = new SearchIndexClient(SearchServiceName, SearchIndexName, new SearchCredentials(SearchAdminApiKey));
        SearchIndexSchema schema = new SearchIndexSchema();
        var indexActionList = new List <IndexAction <SearchIndexSchema> >();
        foreach (var file in fileList.Value)
        {
            try
            {
                Interlocked.Increment(ref docCounter);
                // Progress heartbeat every 100 documents.
                if (docCounter % 100 == 0)
                {
                    System.Console.WriteLine(String.Format("Completed {0} docs in {1} min...", docCounter, DateTime.Now.Subtract(stTime).TotalMinutes));
                }
                schema = new SearchIndexSchema();
                // Read the blob through a SAS URL so Tika can fetch it directly.
                string sasURL = GetBlobSasUri(blobContainer, file);
                Uri uri = new Uri(sasURL);
                var result = textExtractor.Extract(uri);
                var blobMetadata = result.Metadata;
                schema.content = result.Text;
                schema.metadata_storage_name = blobContainer.Uri.ToString() + "/" + file;
                // Key must be URL-safe: base64url-encode the storage path.
                schema.metadata_storage_path = HttpServerUtility.UrlTokenEncode(Encoding.ASCII.GetBytes(schema.metadata_storage_name));
                schema.metadata_content_type = blobMetadata.ContainsKey("Content-Type") == false ? null : blobMetadata["Content-Type"];
                schema.metadata_last_modified = blobMetadata.ContainsKey("Last-Modified") == false ? DateTime.Parse("1900-01-01") : DateTimeOffset.Parse(blobMetadata["Last-Modified"]);
                schema.metadata_word_count = Utilities.WordCount(schema.content);
                schema.metadata_storage_size = Convert.ToInt32(blobMetadata.ContainsKey("Content-Length") == false ? null : blobMetadata["Content-Length"]);
                schema.metadata_character_count = schema.content.Length;
                schema.metadata_author = blobMetadata.ContainsKey("dc:creator") == false ? "" : blobMetadata["dc:creator"];
                // Storage-side properties from the blob itself.
                // NOTE(review): metadata_storage_size is assigned twice — the
                // StreamWriteSizeInBytes value below wins; confirm that is intended.
                CloudBlockBlob blockBlob = blobContainer.GetBlockBlobReference(file);
                blockBlob.FetchAttributes();
                schema.metadata_storage_content_type = blockBlob.Properties.ContentType;
                schema.metadata_storage_content_md5 = blockBlob.Properties.ContentMD5;
                schema.metadata_storage_last_modified = DateTimeOffset.Parse(blockBlob.Properties.LastModified.Value.DateTime.ToString());
                schema.metadata_storage_size = blockBlob.StreamWriteSizeInBytes;
                var action = IndexAction.Upload(schema);
                indexActionList.Add(action);
                //Console.WriteLine("Indexing Counter: " + docCounter);
                // NOTE(review): the list is indexed and cleared per file, so each
                // "batch" actually contains one document.
                indexClient.Documents.Index(IndexBatch.New(indexActionList));
            }
            catch (IndexBatchException ibe)
            {
                // Sometimes when your Search service is under load, indexing will fail for some of the documents in
                // the batch. Depending on your application, you can take compensating actions like delaying and
                // retrying. For this simple demo, we just log the failed document keys and continue.
                Console.WriteLine(
                    "Failed to index some of the documents: {0}",
                    String.Join(", ", ibe.IndexingResults.Where(r => !r.Succeeded).Select(r => r.Key)));
            }
            catch (Exception ex)
            {
                // Sometimes when your Search service is under load, indexing will fail for some of the documents in
                // the batch. Depending on your application, you can take compensating actions like delaying and
                // retrying. For this simple demo, we just log the failed document keys and continue.
                Console.WriteLine(ex.Message);
            }
            indexActionList.Clear();
        }
    });
    System.Console.WriteLine(String.Format("Completed {0} docs in {1} min", FilesToProcess.Count, DateTime.Now.Subtract(stTime).TotalMinutes));
}
/// <summary>
/// Walks the parsed plist nodes of a WeChat moment and dispatches each node to the
/// matching extractor: comments/likes are always collected; text, images, short
/// videos and shared items are collected only when the moment type calls for them.
/// </summary>
private void ExtractItems()
{
    // Decide which item kinds are relevant for this moment type.
    bool extractText = false;
    bool extractImgs = false;
    bool extractVideo = false;
    bool extractSharedItem = false;
    switch (momentInfo.momentType)
    {
        case MomentType.TextOnly:
            extractText = true;
            break;
        case MomentType.WithImg:
            extractText = true;
            extractImgs = true;
            break;
        case MomentType.WithShortVideo:
            extractText = true;
            extractVideo = true;
            break;
        case MomentType.Shared:
            extractText = true;
            extractSharedItem = true;
            break;
    }

    for (int i = 0; i < _object.Count; i++)
    {
        Type currentNodeType = _object[i].GetType();
        if (currentNodeType == typeof(NSDictionary))
        {
            NSDictionary currentItem = (NSDictionary)_object[i];
            if (currentItem.ContainsKey("bDeleted")) // comment or like entry
            {
                DateTime time = TimeUtil.TimeStamp2Datetime(currentItem["createTime"].ToString());
                string type = currentItem["type"].ToString();
                if (type == "1") // like
                {
                    IExtractor<LikedInfo> likeExtractor = new LikeExtractor(_object, i, time);
                    likes.Add(likeExtractor.Extract());
                }
                else if (type == "2") // comment
                {
                    IExtractor<CommentInfo> commentExtractor = new CommentExtractor(_object, i, time);
                    CommentInfo comment = commentExtractor.Extract();
                    if (comment != null)
                    {
                        comments.Add(comment);
                    }
                }
            }
            else if (extractText && currentItem.ContainsValue("WCAppInfo")) // moment text
            {
                IExtractor<string> textExtractor = new TextExtractor(_object, i, momentInfo.posterInfo);
                momentInfo.momentText = textExtractor.Extract();
            }
            else if (extractImgs && currentItem.ContainsKey("encIdx")) // attached image
            {
                IExtractor<Uri> imgExtractor = new ImgExtractor(_object, i);
                Uri imgUrl = imgExtractor.Extract();
                if (imgUrl != null)
                {
                    imgs.Add(imgUrl);
                }
            }
            else if (extractVideo && currentItem.ContainsValue("WCUrl")) // short video
            {
                IExtractor<Uri> shortVideoExtractor = new ShortVideoExtractor(_object, i);
                momentInfo.shortVideoUrl = shortVideoExtractor.Extract();
            }
            // NOTE(review): shared items stored as NSDictionary nodes are not
            // extracted here; only the NSString "WeChat Sight" marker below is.
        }
        else if (extractSharedItem && currentNodeType == typeof(NSString))
        {
            NSString currentItem = (NSString)_object[i];
            if (currentItem.Content == "WeChat Sight") // WeChat Sight share
            {
                IExtractor<SharedItem> weishiSharedExtractor = new WeishiSharedExtractor(_object, i);
                momentInfo.sharedItem = weishiSharedExtractor.Extract();
            }
        }
    }
}
/// <summary>
/// Extracts the plain text of every page of <paramref name="document"/>.
/// </summary>
/// <param name="document">The PDF document to read.</param>
/// <returns>
/// One string per page, in page order. Best-effort: on the first extraction
/// error the pages collected so far are returned.
/// </returns>
private List<string> ExtractPageList(Document document)
{
    var pageList = new List<string>();
    var textExtractor = new TextExtractor();
    try
    {
        foreach (var page in document.Pages)
        {
            var textStrings = textExtractor.Extract(page);
            pageList.Add(TextExtractor.ToString(textStrings));
        }
    }
    catch (Exception e)
    {
        // Log the failure detail instead of discarding it; partial result is kept.
        Utility.Log("Blad: " + e.Message);
    }
    return pageList;
}
/// <summary>
/// Extracting a file that does not exist must raise a TextExtractionException
/// whose message names the offending file.
/// </summary>
public void non_existing_files_should_fail_with_exception()
{
    const string missingFile = "files/doesnotexist.mp3";

    var thrown = typeof(TextExtractionException).ShouldBeThrownBy(() => _cut.Extract(missingFile));

    thrown.Message.ShouldContain(missingFile);
}
/// <summary>
/// Inserts PDF page-reference anchors into Section0000.html, appends a colofon
/// block and a page register to the body, and writes the result to
/// fixRefSection0000.html in <paramref name="outputPath"/>.
/// </summary>
/// <param name="outputPath">Directory containing Section0000.html; also the output directory.</param>
/// <param name="pdfFilePath">Path of the PDF whose per-page text is matched against the HTML.</param>
/// <param name="footerPrefix">Text that marks the footer line on each PDF page.</param>
public void FixPageRef(string outputPath, string pdfFilePath, string footerPrefix = "Belastingblad 2014/")
{
    var mainSection = new InputData(Path.Combine(outputPath, "Section0000.html"));
    var body = mainSection.Document.Descendants().First(e => "body".Equals(e.Name.LocalName));
    var divs = body.Elements()
        .Where(e => "div".Equals(e.Name.LocalName)
                    && e.Attribute("class") != null
                    && "hftekst".Equals(e.Attribute("class").Value));
    Dictionary<int, string> pdfPages = TextExtractor.Extract(pdfFilePath);
    var collected = new HashSet<int>();

    // Descend through up to four levels of heading containers; the first level
    // that has no heading-style child receives the page-reference anchor.
    // (Previously the matching logic below was copy-pasted once per level.)
    foreach (var div in divs)
    {
        foreach (var level1 in div.Elements())
        {
            if (!HasHeadingChild(level1))
            {
                TryAddPageRef(level1, pdfPages, collected, footerPrefix);
                continue;
            }
            foreach (var level2 in level1.Elements())
            {
                if (!HasHeadingChild(level2))
                {
                    TryAddPageRef(level2, pdfPages, collected, footerPrefix);
                    continue;
                }
                foreach (var level3 in level2.Elements())
                {
                    if (!HasHeadingChild(level3))
                    {
                        TryAddPageRef(level3, pdfPages, collected, footerPrefix);
                        continue;
                    }
                    foreach (var level4 in level3.Elements())
                    {
                        TryAddPageRef(level4, pdfPages, collected, footerPrefix);
                    }
                }
            }
        }
    }

    // Append the colofon block to the body.
    var divcolofon = new XElement("div", new XAttribute("class", "colofon"));
    var h2 = new XElement("h2", new XAttribute("class", "kopgeghftekst"));
    var span = new XElement("span", new XAttribute("class", "hfteksttitel"),
        new XAttribute("id", "colofon"), new XText("Colofon"));
    var img = new XElement("img", new XAttribute("src", "../Images/colofon.jpg"),
        new XAttribute("alt", ""), new XAttribute("class", "calibre7"));
    h2.Add(span);
    divcolofon.Add(h2, img);
    body.Add(divcolofon);

    // Append the page register: one back-link per collected page number.
    var d = new XElement("div", new XAttribute("class", "hftekst"));
    var a1 = new XElement("a", new XAttribute("href", "../Text/Inhoud.html#paginaregister"),
        new XAttribute("id", "paginaregister"), new XAttribute("class", "vindplaats1"),
        new XText(" Paginaregister "));
    d.Add(a1);
    foreach (var i in collected)
    {
        var spanpagereglinkdiv = new XElement("span", new XAttribute("class", "pagereglinkdiv"));
        var apagereglinkdiv = new XElement("a", new XAttribute("class", "pagereglink"),
            new XAttribute("href", "#pageref_" + i),
            new XAttribute("id", "backpageref_" + i),
            new XText(i.ToString()));
        spanpagereglinkdiv.Add(apagereglinkdiv);
        d.Add(spanpagereglinkdiv);
    }
    body.Add(d);

    mainSection.Document.Write(Path.Combine(outputPath, "fixRefSection0000.html"), Formatting.Indented);
}

// True when the element contains a heading-style div (kopgegp / hftekst / auteurgeg).
private static bool HasHeadingChild(XElement element)
{
    return element.Elements().Any(e =>
        "div".Equals(e.Name.LocalName)
        && e.Attribute("class") != null
        && ("kopgegp".Equals(e.Attribute("class").Value)
            || "hftekst".Equals(e.Attribute("class").Value)
            || "auteurgeg".Equals(e.Attribute("class").Value)));
}

// Scans the not-yet-used PDF pages (skipping pages 1-2) for the first page whose
// first content line after the footer occurs in the element's text; when found,
// appends a page-reference anchor to the element and marks the page as used.
private void TryAddPageRef(XElement target, Dictionary<int, string> pdfPages,
    HashSet<int> collected, string footerPrefix)
{
    string val = ValueOfElement(target);
    foreach (var pdfPage in pdfPages.Where(e => !collected.Contains(e.Key) && e.Key > 2))
    {
        string[] lines = pdfPage.Value.Split('\n');

        // Locate the last line containing the footer prefix (-1 when absent).
        int footerLine = lines.Length - 1;
        while (footerLine > -1)
        {
            if (lines[footerLine].Contains(footerPrefix))
            {
                break;
            }
            footerLine--;
        }
        if (footerLine + 1 == lines.Length)
        {
            continue; // footer is the very last line: nothing follows it
        }

        // Take the first line after the footer; while it is 40 chars or fewer,
        // advance to subsequent lines looking for a longer one.
        string firstSentence = NormalizeLine(lines[footerLine + 1]);
        while (firstSentence.Length <= 40)
        {
            // NOTE: the post-increment makes the first pass re-read the same
            // line before advancing — kept as-is to preserve original matching.
            firstSentence = NormalizeLine(lines[footerLine++ + 1]);
            if (footerLine == lines.Length - 1)
            {
                break;
            }
        }

        if (!val.Contains(firstSentence))
        {
            continue;
        }

        var pagerefdiv = new XElement("div", new XAttribute("class", "pagerefdiv"));
        var a = new XElement("a",
            new XAttribute("class", "pcalibre pageref pcalibre1"),
            new XAttribute("href", "#backpageref_" + pdfPage.Key),
            new XAttribute("id", "pageref_" + pdfPage.Key),
            new XText(pdfPage.Key.ToString()));
        pagerefdiv.Add(a);
        target.Add(pagerefdiv);
        collected.Add(pdfPage.Key);
        return; // at most one page reference per element
    }
}

// Lower-case the line, strip all whitespace, drop a trailing hyphenation dash.
private static string NormalizeLine(string line)
{
    return Regex.Replace(line.ToLower(), @"\s+", "").TrimEnd('-');
}
/// <summary>
/// Extracts the Scaliger PDF, splits its text into chunks of fewer than 5000
/// characters and — when the (currently disabled) translation calls are
/// re-enabled — appends the translated pages to de_emendatione_temporum_translated.txt.
/// </summary>
public void TestTranslateScaliger()
{
    var te = new TextExtractor();
    var pdfContents = te.Extract(@"Iosephi_Scaligeri_Opus_de_emendatione_te.pdf");
    Assert.IsNotNull(pdfContents);
    var lines = FormatLines(pdfContents);
    Assert.IsTrue(lines.Length > 10);

    // Group the lines into pages of fewer than 5000 characters.
    var pages = new List<string>();
    var output = new StringBuilder();
    foreach (var line in lines)
    {
        if (output.Length + line.Length >= 5000)
        {
            pages.Add(output.ToString());
            output.Clear();
        }
        // BUGFIX: the line that triggered a flush used to be dropped entirely;
        // now every line is carried into the current (or next) page.
        output.Append(line);
    }
    if (output.Length > 0)
    {
        pages.Add(output.ToString());
    }
    Assert.IsNotNull(pages[0]);

    var translatedText = new List<string>();
    TranslationClient client = TranslationClient.Create();
    int startPage = 162; // resume point of a previous partial run
    for (int i = startPage; i < pages.Count; i++)
    {
        var page = pages[i];
        try
        {
            if (page.Length > 0)
            {
                // Translation calls deliberately disabled (paid API); the
                // Replace chain compensates for long-s OCR artifacts.
                //var response = client.TranslateText(page.Replace("fign","sign").Replace("ff", "ss").Replace("bf", "bs").Replace("fs", "ss").Replace("fc", "sc").Replace("ft", "st").Replace("fol", "sol"), "en");
                //translatedText.Add(response.TranslatedText);
                //System.IO.File.WriteAllText("Translation\\de_emendatione_temporum_translated_" + i, response.TranslatedText);
                //System.Threading.Thread.Sleep(5000);
            }
        }
        catch (Exception)
        {
            // Best-effort: a failed page is skipped and the loop continues.
        }
    }

    // BUGFIX: the writer was never disposed; `using` guarantees flush + close.
    using (var stream = System.IO.File.AppendText("de_emendatione_temporum_translated.txt"))
    {
        for (int i = startPage; i < translatedText.Count; i++)
        {
            var page = translatedText[i];
            stream.WriteLine("Page " + (i + 1));
            stream.WriteLine();
            stream.WriteLine(page);
            stream.WriteLine();
        }
    }
}
/// <summary>
/// Extract must reject a null file path with an ArgumentNullException.
/// </summary>
public void TestExtractMethodWithUndefinedFile()
{
    string missingPath = null; // typed local keeps the string overload selected

    Assert.ThrowsException<ArgumentNullException>(() => TextExtractor.Extract(missingPath));
}