예제 #1
1
        /// <summary>
        /// Asks the user for a PDF file, then walks its pages and prints every text string
        /// extracted from the default area together with its rounded bounding box.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Open the PDF chosen by the user; the file is disposed when the using block exits.
            string pdfPath = PromptFileChoice("Please select a PDF file");
            using(File pdfFile = new File(pdfPath))
            {
                Document pdfDocument = pdfFile.Document;

                // 2. Extract and print the text of each page.
                TextExtractor textExtractor = new TextExtractor();
                foreach(Page currentPage in pdfDocument.Pages)
                {
                    // A false answer from the prompt ends the run.
                    if(!PromptNextPage(currentPage, false))
                    {
                        Quit();
                        break;
                    }

                    IList<ITextString> pageStrings = textExtractor.Extract(currentPage)[TextExtractor.DefaultArea];
                    foreach(ITextString extracted in pageStrings)
                    {
                        RectangleF box = extracted.Box.Value;
                        Console.WriteLine(
                            string.Format(
                                "Text [x:{0},y:{1},w:{2},h:{3}]: {4}",
                                Math.Round(box.X),
                                Math.Round(box.Y),
                                Math.Round(box.Width),
                                Math.Round(box.Height),
                                extracted.Text
                                )
                            );
                    }
                }
            }
        }
예제 #2
1
        /// <summary>
        /// Asks for a PDF file and a regular expression, passes the case-insensitive matches
        /// found in each page's extracted text to a <c>TextHighlighter</c>, then serializes the file.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Open the PDF chosen by the user.
            string pdfPath = PromptFileChoice("Please select a PDF file");
            using(File pdfFile = new File(pdfPath))
            {
                // Build the search pattern from the user's input.
                string userPattern = PromptChoice("Please enter the pattern to look for: ");
                Regex searchPattern = new Regex(userPattern, RegexOptions.IgnoreCase);

                // 2. Scan the pages one by one.
                TextExtractor extractor = new TextExtractor(true, true);
                foreach(Page currentPage in pdfFile.Document.Pages)
                {
                    Console.WriteLine(string.Format("\nScanning page {0}...\n", currentPage.Index + 1));

                    // 2.1. Extract the page text.
                    IDictionary<RectangleF?,IList<ITextString>> pageText = extractor.Extract(currentPage);

                    // 2.2. Search the flattened page text for the pattern.
                    MatchCollection found = searchPattern.Matches(TextExtractor.ToString(pageText));

                    // 2.3. Hand the matches to the highlighter through the extractor's filter.
                    TextHighlighter highlighter = new TextHighlighter(currentPage, found);
                    extractor.Filter(pageText, highlighter);
                }

                // 3. Serialize the processed file.
                Serialize(pdfFile);
            }
        }
예제 #3
0
        /// <summary>
        /// Extracts <c>file</c> through <c>_cut</c> and maps the extraction result, together
        /// with the file's creation and last-write times, onto a new <c>Files</c> instance.
        /// </summary>
        /// <returns>
        /// The populated instance. <c>Title</c> falls back to <c>file</c> when no "title"
        /// metadata exists; <c>ContentLength</c> and <c>Author</c> are only set when the
        /// matching metadata entry is present (and, for the length, parses as a long).
        /// </returns>
        public Files tika_metadata()
        {
            var result = _cut.Extract(file);
            var files  = new Files()
            {
                Id           = Path.GetFileName(file),
                ContentType  = result.ContentType,
                Body         = result.Text,
                FilePath     = Path.GetDirectoryName(file),
                CreatedDate  = File.GetCreationTime(file),
                ModifiedDate = File.GetLastWriteTime(file)
            };

            var metadata = result.Metadata;

            // TryGetValue does one lookup per key instead of ContainsKey followed by the indexer.
            string title;
            files.Title = metadata.TryGetValue("title", out title) ? title : file;

            string rawLength;
            long   contentLength;
            if (metadata.TryGetValue("Content-Length", out rawLength) &&
                long.TryParse(rawLength, out contentLength))
            {
                files.ContentLength = contentLength;
            }

            string author;
            if (metadata.TryGetValue("Author", out author))
            {
                files.Author = author;
            }

            return(files);
        }
예제 #4
0
        /// <summary>
        /// Prompts for a PDF file and prints, page by page, each text string found in the
        /// extractor's default area, prefixed by its rounded x/y/width/height.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Let the user pick the PDF and open it.
            string chosenPath = PromptFileChoice("Please select a PDF file");

            using (var pdf = new File(chosenPath))
            {
                Document doc = pdf.Document;

                // 2. Dump the text of every page until the user stops.
                TextExtractor pageExtractor = new TextExtractor();
                foreach (Page pdfPage in doc.Pages)
                {
                    bool keepGoing = PromptNextPage(pdfPage, false);
                    if (!keepGoing)
                    {
                        Quit();
                        break;
                    }

                    IList <ITextString> found = pageExtractor.Extract(pdfPage)[TextExtractor.DefaultArea];
                    foreach (ITextString item in found)
                    {
                        RectangleF bounds = item.Box.Value;

                        // Rounded geometry, joined exactly as "x:..,y:..,w:..,h:..".
                        string geometry = string.Join(
                            ",",
                            "x:" + Math.Round(bounds.X),
                            "y:" + Math.Round(bounds.Y),
                            "w:" + Math.Round(bounds.Width),
                            "h:" + Math.Round(bounds.Height)
                            );

                        Console.WriteLine("Text [" + geometry + "]: " + item.Text);
                    }
                }
            }
        }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="ArgumentNullException"/>
 /// when it is given a null byte array.
 /// </summary>
 public void TestExtractMethodWithUndefinedBytes()
 {
     byte[] undefinedBytes = null;

     Assert.ThrowsException <ArgumentNullException>(() => TextExtractor.Extract(undefinedBytes));
 }
 /// <summary>
 /// Runs <c>extractor.Extract(rawData)</c> through <c>Task.Run</c> and returns the resulting task.
 /// </summary>
 /// <param name="extractor">The extractor whose <c>Extract</c> method is invoked.</param>
 /// <param name="rawData">The raw bytes handed to <c>Extract</c>.</param>
 /// <returns>A task that completes with the extraction result.</returns>
 public static Task <TextExtractionResult> ExtractAsync(this TextExtractor extractor, byte[] rawData)
 {
     return Task.Run(() => extractor.Extract(rawData));
 }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="ArgumentNullException"/>
 /// when it is given a null stream.
 /// </summary>
 public void TestExtractMethodWithUndefinedStream()
 {
     Stream undefinedStream = null;

     Assert.ThrowsException <ArgumentNullException>(() => TextExtractor.Extract(undefinedStream));
 }
        /// <summary>
        /// Saves the first uploaded file under ~/Files/ with a tick-based unique name and,
        /// for non-PDF files, extracts its text and metadata and writes them to the Lucene index.
        /// </summary>
        /// <returns>The default view, whether or not a file was processed.</returns>
        public ActionResult Upload()
        {
            if (Request.Files.Count == 0)
            {
                return(View());
            }

            var file = Request.Files[0];
            if (file == null || file.ContentLength <= 0)
            {
                return(View());
            }

            // Path.GetExtension copes with names without a dot (returns "") and with names
            // holding several dots (returns the last segment). Splitting on '.' and reading
            // element [1] threw on the former and picked the wrong segment on the latter.
            var fileName         = Path.GetFileName(file.FileName);
            var extension        = Path.GetExtension(fileName); // includes the leading '.', or "" when absent
            var myUniqueFileName = string.Format(@"{0}{1}", DateTime.Now.Ticks, extension);
            var path             = Path.Combine(Server.MapPath("~/Files/"), myUniqueFileName);//TO DO change path
            file.SaveAs(path);

            // Extensions are compared case-insensitively so "x.PDF" is treated like "x.pdf".
            if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
            {
                // TO DO html version
                return(View());
            }

            var textExtractor   = new TextExtractor();
            var wordDocContents = textExtractor.Extract(path);
            var metadata        = wordDocContents.Metadata;

            // Missing or null metadata becomes "" so a null never reaches the Field constructor.
            string author;
            if (!metadata.TryGetValue("Author", out author) || author == null)
            {
                author = "";
            }
            string title;
            if (!metadata.TryGetValue("title", out title) || title == null)
            {
                title = "";
            }
            string description;
            if (!metadata.TryGetValue("description", out description) || description == null)
            {
                description = "";
            }

            Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
            // NOTE(review): machine-specific absolute path; should presumably come from configuration.
            Lucene.Net.Store.Directory directory = FSDirectory.Open(@"C:\Users\Totalit\Source\Repos\FilesAnalyzer\FilesAnalyzer\FilesAnalyzer\IndexedFiles");
            // NOTE(review): the third argument (create) is true, which looks like it recreates
            // the index on every upload; confirm that is intended.
            IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                Document doc = new Document();

                doc.Add(new Field("Author",
                                  author,
                                  Field.Store.YES,
                                  Field.Index.ANALYZED));
                doc.Add(new Field("title",
                                  title,
                                  Field.Store.YES,
                                  Field.Index.ANALYZED));
                doc.Add(new Field("description",
                                  description,
                                  Field.Store.YES,
                                  Field.Index.ANALYZED));
                doc.Add(new Field("text",
                                  wordDocContents.Text ?? "",
                                  Field.Store.YES,
                                  Field.Index.ANALYZED));

                writer.AddDocument(doc);
                writer.Optimize();
            }
            finally
            {
                // Close the writer even when indexing fails, so it is never left open.
                writer.Close();
            }

            return(View());
        }
예제 #9
0
        /// <summary>
        /// Asks for a PDF file and, page by page, prints every link annotation: the text
        /// found under the link box, the box position and the link target.
        /// </summary>
        public override void Run(
            )
        {
            // 1. Open the PDF chosen by the user.
            string pdfPath = PromptFileChoice("Please select a PDF file");

            using (files::File pdfFile = new files::File(pdfPath))
            {
                Document pdfDocument = pdfFile.Document;

                // 2. Extract the links of each page.
                TextExtractor textExtractor = new TextExtractor
                {
                    AreaTolerance = 2 // 2 pt tolerance on area boundary detection.
                };
                bool linkSeen = false;
                foreach (Page currentPage in pdfDocument.Pages)
                {
                    // At this point linkSeen still describes the previous page.
                    if (!PromptNextPage(currentPage, !linkSeen))
                    {
                        Quit();
                        break;
                    }

                    // The page text is extracted lazily, only once a link is met.
                    IDictionary <RectangleF?, IList <ITextString> > pageText = null;
                    linkSeen = false;

                    PageAnnotations pageAnnotations = currentPage.Annotations;
                    if (!pageAnnotations.Exists())
                    {
                        Console.WriteLine("No annotations here.");
                        continue;
                    }

                    foreach (Annotation candidate in pageAnnotations)
                    {
                        // Only link annotations are reported.
                        if (!(candidate is Link))
                        {
                            continue;
                        }

                        linkSeen = true;
                        if (pageText == null)
                        {
                            pageText = textExtractor.Extract(currentPage);
                        }

                        Link       link = (Link)candidate;
                        RectangleF area = link.Box;

                        /*
                         * Text: links are only related to the page text by their location,
                         * so the extracted text is filtered by the link area.
                         */
                        StringBuilder caption = new StringBuilder();
                        foreach (ITextString piece in textExtractor.Filter(pageText, area))
                        {
                            caption.Append(piece.Text);
                        }
                        Console.WriteLine(string.Format("Link '{0}' ", caption.ToString()));

                        // Position.
                        Console.WriteLine(
                            string.Format(
                                "    Position: x:{0},y:{1},w:{2},h:{3}",
                                Math.Round(area.X),
                                Math.Round(area.Y),
                                Math.Round(area.Width),
                                Math.Round(area.Height)
                                )
                            );

                        // Target.
                        Console.Write("    Target: ");
                        PdfObjectWrapper linkTarget = link.Target;
                        if (linkTarget is Destination)
                        {
                            PrintDestination((Destination)linkTarget);
                        }
                        else if (linkTarget is actions::Action)
                        {
                            PrintAction((actions::Action)linkTarget);
                        }
                        else if (linkTarget == null)
                        {
                            Console.WriteLine("[not available]");
                        }
                        else
                        {
                            Console.WriteLine("[unknown type: " + linkTarget.GetType().Name + "]");
                        }
                    }

                    if (!linkSeen)
                    {
                        Console.WriteLine("No links here.");
                    }
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Extracts the sample MP4, checks the reported content type, then deletes the
        /// file at a fixed build-output path and checks that it is gone.
        /// </summary>
        public void should_extract_mp4()
        {
            var mp4Result = _cut.Extract("files/badgers.mp4");
            mp4Result.ContentType.ShouldEqual("video/mp4");

            // NOTE(review): absolute path tied to one machine; presumably the delete checks
            // that the extractor no longer holds the file open. Confirm.
            var extractedFile = new FileInfo(@"C:\projects\tikaondotnet\src\TikaOnDotNet.Tests\bin\Debug\files\badgers.mp4");
            extractedFile.Delete();

            extractedFile.Exists.ShouldBeFalse();
        }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="NotSupportedException"/>
 /// when it is given an empty byte array.
 /// </summary>
 public void TestExtractMethodWithMissingBytes()
 {
     byte[] noBytes = new byte[0];

     Assert.ThrowsException <NotSupportedException>(() => TextExtractor.Extract(noBytes));
 }
예제 #12
0
        /// <summary>
        /// Extracting a path that does not exist must throw a <c>TextExtractionException</c>
        /// whose message contains that path.
        /// </summary>
        public void non_existing_files_should_fail_with_exception()
        {
            const string missingFile = "files/doesnotexist.mp3";
            Action extractMissing = () => { _cut.Extract(missingFile); };

            var thrown = extractMissing.ShouldThrow <TextExtractionException>();

            thrown.Which.Message.Should().Contain(missingFile);
        }
예제 #13
0
        /// <summary>
        /// Wraps <paramref name="cdn"/> in a <see cref="Uri"/> and extracts it through <c>_cut</c>.
        /// </summary>
        /// <param name="cdn">The address to extract from, as a string accepted by the <see cref="Uri"/> constructor.</param>
        /// <returns>The extraction result returned by <c>_cut.Extract</c>.</returns>
        private TextExtractionResult ExtractFromCdn(string cdn)
        {
            Uri source = new Uri(cdn);

            return _cut.Extract(source);
        }
예제 #14
0
 /// <summary>
 /// Passes the first command-line argument to <c>TextExtractor.Extract</c>;
 /// the returned dictionary is not used further here.
 /// </summary>
 /// <param name="args">Command-line arguments; the first one is read.</param>
 static void Main(string[] args)
 {
     string inputPath = args[0];
     Dictionary <int, string> pageTexts = TextExtractor.Extract(inputPath);
 }
        /// <summary>
        /// Extracts the whitespace-preserving sample document and checks that the
        /// result equals the expected sentence.
        /// </summary>
        public void TestExtractMethodWithValidFileWithWhitespacePreserveFile()
        {
            const string expectedText = "This is a Word document.";

            var extracted = TextExtractor.Extract(@"Documents\ValidWithWhitespacePreserve.docx");
            bool matches  = extracted == expectedText;

            Assert.IsTrue(matches);
        }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="NotSupportedException"/>
 /// for the invalid sample document.
 /// </summary>
 public void TestExtractMethodWithInvalidFile()
 {
     const string invalidDocument = @"Documents\Invalid.docx";

     Assert.ThrowsException <NotSupportedException>(() => TextExtractor.Extract(invalidDocument));
 }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="FileNotFoundException"/>
 /// when it is given an empty path.
 /// </summary>
 public void TestExtractMethodWithMissingFile()
 {
     string emptyPath = string.Empty;

     Assert.ThrowsException <FileNotFoundException>(() => TextExtractor.Extract(emptyPath));
 }
예제 #18
0
        /// <summary>
        /// Loads the HFCC input, searches the PDF page texts for each section's kenmerk
        /// (prefixed with <paramref name="footerPrefix"/>) and appends an
        /// <c>&lt;a type="pageref"&gt;</c> element holding the page number to the first
        /// "samenvatting" or "ablok" element whose value contains the line following the match.
        /// </summary>
        /// <param name="hfccFilePath">Path handed to the <c>InputData</c> constructor.</param>
        /// <param name="pdfFilePath">Path handed to <c>TextExtractor.Extract</c>.</param>
        /// <param name="footerPrefix">Text prepended to the trimmed kenmerk value to form the search string.</param>
        /// <returns>The <c>Document</c> of the loaded input.</returns>
        public XDocument FixPageRef(string hfccFilePath, string pdfFilePath, string footerPrefix)
        {
            var hfcc = new InputData(hfccFilePath);
            IEnumerable <XElement> sections = SplitSection(hfcc);

            // Nothing to annotate when the input has no sections.
            if (!sections.Any())
            {
                return(hfcc.Document);
            }

            Dictionary <int, string> pdfPages = TextExtractor.Extract(pdfFilePath);
            var linkedPages = new HashSet <int>();

            foreach (XElement section in sections)
            {
                // A section is only processed when kenmerkgrp/kenmerk and
                // commentaarcontent/verhandelingalgemeen are all present.
                XElement kenmerkgrp = section.Element("kenmerkgrp");
                XElement kenmerk    = kenmerkgrp == null ? null : kenmerkgrp.Element("kenmerk");
                if (kenmerk == null)
                {
                    continue;
                }
                XElement commentaarcontent = section.Element("commentaarcontent");
                XElement general           = commentaarcontent == null
                                                 ? null
                                                 : commentaarcontent.Element("verhandelingalgemeen");
                if (general == null)
                {
                    continue;
                }

                string query = footerPrefix + kenmerk.Value.Trim();

                // Snapshot of the pages whose text mentions the query.
                List <KeyValuePair <int, string> > hits =
                    pdfPages.Where(candidate => candidate.Value.Contains(query)).ToList();
                XElement summary = general.Element("samenvatting");

                foreach (KeyValuePair <int, string> hit in hits)
                {
                    string[] lines = hit.Value.Split('\n');

                    // Index of the last line containing the query (-1 when no single line does).
                    int queryLine = lines.Length - 1;
                    while (queryLine > -1 && !lines[queryLine].Contains(query))
                    {
                        queryLine--;
                    }

                    // The query sits on the very last line: there is no following line to match on.
                    if (queryLine + 1 == lines.Length)
                    {
                        continue;
                    }

                    string firstSentence =
                        lines[queryLine + 1].Trim().TrimEnd('-').Replace(" ", "").ToLower();

                    // Search order: the summary, then direct "ablok" children, then "ablok"
                    // descendants of the "p" children. The first match receives the reference.
                    XElement target = null;
                    if (summary != null && ValueOfElement(summary).Contains(firstSentence))
                    {
                        target = summary;
                    }
                    if (target == null)
                    {
                        target = general.Elements("ablok")
                                 .FirstOrDefault(ablok => ValueOfElement(ablok).Contains(firstSentence));
                    }
                    if (target == null)
                    {
                        target = general.Elements("p")
                                 .SelectMany(p => p.Descendants().Where(e => "ablok".Equals(e.Name.LocalName)))
                                 .FirstOrDefault(ablok => ValueOfElement(ablok).Contains(firstSentence));
                    }

                    if (target != null)
                    {
                        target.Add(new XElement("a", new XAttribute("type", "pageref"), hit.Key));
                        linkedPages.Add(hit.Key);
                    }
                }
            }

            // Report the pages that did not receive a reference.
            List <int> unlinked = pdfPages.Keys.Where(k => !linkedPages.Contains(k)).ToList();
            if (unlinked.Count > 0)
            {
                string report = "Couldn't add page ref: " + string.Join(", ", unlinked);
                Console.WriteLine(report);
                Debug.WriteLine(report);
            }

            return(hfcc.Document);
        }
        /// <summary>
        /// Opens the PDF at <c>filePath</c> and scans the extracted text of its pages for
        /// statement lines, turning them into <c>Transaction</c> objects.
        /// </summary>
        /// <remarks>
        /// <c>filePath</c>, <c>account</c>, <c>eof</c>, <c>flag</c>, <c>flag1</c> and <c>flag2</c>
        /// are not declared in this method; they come from the enclosing type.
        /// NOTE(review): apart from the initial range check, <paramref name="startPage"/>,
        /// <paramref name="endPage"/> and <paramref name="usePageNum"/> do not influence which
        /// pages are scanned in the live code (only commented-out code refers to them).
        /// </remarks>
        /// <param name="startPage">Only used for the range check and for unused locals.</param>
        /// <param name="endPage">Only used for the range check and for unused locals.</param>
        /// <param name="usePageNum">Not read by the live code.</param>
        /// <returns>The list the continuation branch adds transactions to (see notes in the body).</returns>
        /// <exception cref="Exception">Thrown when <paramref name="endPage"/> is not greater than <paramref name="startPage"/>.</exception>
        public List <Transaction> RunOld(int startPage, int endPage, bool usePageNum = false)
        {
            if (endPage <= startPage)
            {
                throw new Exception("End page <= Start page.");
            }

            // NOTE(review): startIdx, cnt and textStrings are assigned but never read below.
            var startIdx     = startPage - 1;
            var cnt          = endPage - startPage;
            var textStrings  = new List <string>();
            var transactions = new List <Transaction>();

            // 1. Opening the PDF file...
            using (File file = new File(filePath))
            {
                var document = file.Document;

                //if (usePageNum)
                //{
                //if (endPage >= document.Pages.Count)
                //{
                //   throw new Exception("Endpage > total pages.");
                // }
                //return UsePageNum(startIdx, cnt, document);
                // }
                // 2. Text extraction from the document pages.
                var extractor     = new TextExtractor();
                // foundAccount and start are declared outside the page loop, so their state
                // carries over from one page to the next.
                var foundAccount  = false;
                var start         = false;
                var pgCnt         = 0;   // 1-based number of the page being visited
                var lastAccntPage = -1;  // number of the last page that was processed; -1 until one is
                foreach (var page in document.Pages)
                {
                    pgCnt++;
                    //if (!PromptNextPage(page, false))
                    //{
                    //  break;
                    //}
                    IList <ITextString> pageTextStrings = new List <ITextString>();
                    try
                    {
                        pageTextStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
                    }
                    catch
                    {
                        // Any extraction failure silently skips the page.
                        continue;
                    }
                    if (pageTextStrings == null || pageTextStrings.Count == 0)
                    {
                        continue;
                    }

                    // Until a first page has been processed, skip pages that do not mention the account.
                    if (lastAccntPage == -1 && !pageTextStrings.Any(x => x.Text.Contains(account)))
                    {
                        continue;
                    }
                    // Afterwards only the page directly following the last processed one is accepted.
                    // NOTE(review): lastAccntPage is not updated when a page is skipped, so once a
                    // gap occurs every later page fails this test as well.
                    if (lastAccntPage > -1 && pgCnt - 1 != lastAccntPage)
                    {
                        continue;
                    }
                    lastAccntPage = pgCnt;
                    // Transaction built from the most recent parsed line on this page, if any.
                    Transaction transactionNowAndPrevious = null;
                    foreach (ITextString textString in pageTextStrings)
                    {
                        // A line mentioning the account opens the account section.
                        if (textString.Text.Contains(account))
                        {
                            foundAccount = true;
                            continue;
                        }
                        // The eof marker closes the section and ends the scan of this page.
                        if (textString.Text.Contains(eof))
                        {
                            foundAccount = false;
                            break;
                        }
                        if (!foundAccount)
                        {
                            continue;
                        }

                        // Any of the three flag texts switches line parsing on.
                        if (textString.Text.Contains(flag))
                        {
                            start = true;
                            continue;
                        }
                        if (textString.Text.Contains(flag1))
                        {
                            start = true;
                            continue;
                        }
                        if (textString.Text.Contains(flag2))
                        {
                            start = true;
                            continue;
                        }
                        if (start && textString.Text.Contains("Beginning"))
                        {
                            continue;
                        }

                        var  line = textString.Text.Trim().Split(' ');
                        long test;
                        // A line consisting of a single integer token switches parsing off again.
                        if (line.Length == 1 && !string.IsNullOrEmpty(line[0]) && long.TryParse(line[0], out test))
                        {
                            start = false;
                        }
                        if (!start)
                        {
                            continue;
                        }



                        decimal amnt     = 0;
                        decimal balance  = 0;
                        var     dateTest = DateTime.MinValue;
                        // Case 1: the last two tokens both parse as decimals (amount, then balance).
                        if (line.Length > 1 && decimal.TryParse(line[line.Length - 2], out amnt) && decimal.TryParse(line[line.Length - 1], out balance))
                        {
                            var desc = line.ToList();
                            // The date parse result is not checked here.
                            DateTime.TryParse(line[0], out dateTest);
                            // Drop the last two tokens and the first one; what remains is the description.
                            // NOTE(review): with exactly two tokens the list is already empty when
                            // RemoveAt(0) runs, which throws ArgumentOutOfRangeException.
                            desc.RemoveAt(line.Length - 1);
                            desc.RemoveAt(line.Length - 2);
                            desc.RemoveAt(0);
                            desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();

                            // NOTE(review): the parsed balance is not stored, and the new transaction
                            // is not added to the result list in this branch.
                            transactionNowAndPrevious = new Transaction
                            {
                                Amount      = amnt,
                                Date        = dateTest,
                                Description = string.Join(" ", desc),
                                Misc        = textString.Box.ToString(),
                                // Width of the first virtual text char decides the type; First throws
                                // when no char is virtual. NOTE(review): meaning of 120 is not shown here.
                                Type        = textString.TextChars.First(x => x.Virtual == true).Box.Width < 120
                                                 ? TransactionType.Deposit
                                                 : TransactionType.Withdrawal
                            };
                        }
                        // Case 2: the last token parses as a decimal and the first token as a date.
                        else if (line.Length > 0 && decimal.TryParse(line[line.Length - 1], out amnt) && DateTime.TryParse(line[0], out dateTest))
                        {
                            // No balance
                            var desc = line.ToList();
                            // NOTE(review): with a single token the second RemoveAt runs on an empty list.
                            desc.RemoveAt(line.Length - 1);
                            desc.RemoveAt(0);
                            desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();

                            transactionNowAndPrevious = new Transaction
                            {
                                Amount      = amnt,
                                Date        = dateTest,
                                Description = string.Join(" ", desc),
                                Misc        = textString.Box.ToString(),
                                Type        = textString.TextChars.First(x => x.Virtual == true).Box.Width < 120
                                                 ? TransactionType.Deposit
                                                 : TransactionType.Withdrawal
                            };
                        }
                        // Case 3: any other line is appended to the current transaction's description.
                        else
                        {
                            transactionNowAndPrevious              = transactionNowAndPrevious ?? new Transaction();
                            transactionNowAndPrevious.Description += $" {textString.Text}";
                            // NOTE(review): this is the only place the result list grows, so a
                            // transaction without a following unparsed line is never returned, and one
                            // followed by several such lines is added once per line. Confirm intent.
                            transactions.Add(transactionNowAndPrevious);
                        }
                    }
                }
            }
            return(transactions);
        }
예제 #20
0
 /// <summary>
 /// Extracts the text found at the document's address, splits it into words and
 /// adds those words, together with the document, to the inverted index.
 /// </summary>
 /// <param name="doc">The document whose <c>Address</c> is extracted and tokenized.</param>
 /// <exception cref="TextExtractionException">Could not extract Files from the Document</exception>
 public static void AddFileFrom(Document doc)
 {
     var      extraction = x.Extract(doc.Address);
     String[] tokens     = Semanter.Splitwords(extraction.Text);

     invt.AddDocument(tokens, doc);
 }
 /// <summary>
 /// Checks that <c>TextExtractor.Extract</c> throws <see cref="NotSupportedException"/>
 /// when it is given an empty memory stream.
 /// </summary>
 public void TestExtractMethodWithMissingStream()
 {
     var emptyStream = new MemoryStream();

     Assert.ThrowsException <NotSupportedException>(() => TextExtractor.Extract(emptyStream));
 }
예제 #22
0
        /// <summary>
        /// Converts the file to text with the configured text extractor and adds it to the index.
        /// The available extractors are:
        ///  - Apache Tika
        ///  - IFilter
        /// Markdown files are read directly, as are text files detected as Shift_JIS.
        /// </summary>
        /// <param name="path">Path of the file to index.</param>
        /// <param name="indexWriter">Writer that receives the new document (and the delete of a stale one).</param>
        /// <param name="threadName">Prefix used in log messages.</param>
        /// <param name="docDic">
        /// Already-known documents keyed by path; may be null. A matching entry is flagged
        /// as existing and is used to skip files that have not changed.
        /// </param>
        /// <returns>
        /// <c>true</c> when a document was added; <c>false</c> when the file was skipped
        /// (extension not targeted, over the size limit, or not updated).
        /// </returns>
        private bool AddDocument(string path, IndexWriter indexWriter, string threadName, Dictionary <string, DocInfo> docDic)
        {
            string   filename  = System.IO.Path.GetFileName(path);
            string   extension = System.IO.Path.GetExtension(path);
            FileInfo fi        = new FileInfo(path);

            // Lower-cased once and reused for every extension test and for the stored field.
            // NOTE(review): ToLower() is culture-sensitive; kept because the keys of
            // _targetExtensionDic are built outside this method.
            string lowerExtension = extension.ToLower();

            if (extension == "" ||
                !_targetExtensionDic.ContainsKey(lowerExtension))
            {
                // No extension, or the extension is not one of the targeted ones.
                AppObject.Logger.Info(threadName + ":" + "Out of target extension. Skipped: " + path);
                Interlocked.Increment(ref _skippedCount);

                return(false);
            }
            if (lowerExtension != ".mp4" && fi.Length > this.FileSizeLimit)
            {
                // Over the size limit (mp4 files are exempt).
                AppObject.Logger.Info(threadName + ":" + "File size over. Skipped: " + path);
                Interlocked.Increment(ref _skippedCount);

                return(false);
            }

            // Is this a document that is already known?
            // TryGetValue performs one lookup instead of ContainsKey followed by the indexer.
            DocInfo di;
            if (docDic != null && docDic.TryGetValue(path, out di))
            {
                di.Exists    = true;
                docDic[path] = di;
                // Compare the update time at one-second granularity.
                if (di.UpdateDate < DateTimeUtil.Truncate(fi.LastWriteTime, TimeSpan.FromSeconds(1)))
                {
                    // Updated: delete the old entry here, the new one is inserted below.
                    Term t = new Term(LuceneIndexBuilder.Path, di.Path);
                    indexWriter.DeleteDocuments(t);
                }
                else
                {
                    // Not updated.
                    AppObject.Logger.Info(threadName + ":" + "No updated. Skipped: " + path);
                    Interlocked.Increment(ref _skippedCount);

                    return(false);
                }
            }

            // Build the document to add.
            Document doc = new Document();

            // Set to false once the content field has been filled by a direct read.
            bool useExtractor = true;

            if (lowerExtension == ".md")
            {
                // Markdown: index the raw file content.
                string content = ReadToString(path);
                doc.Add(new Field(Content, content, _hilightFieldType));
                useExtractor = false;
            }
            else if (lowerExtension == ".txt")
            {
                // Text file: read it directly when it is detected as Shift_JIS.
                var sjis = Encoding.GetEncoding("Shift_JIS");
                // Encoding does not overload ==, so the former comparison was by reference;
                // Equals also matches an equivalent Encoding instance.
                if (sjis.Equals(FileUtil.GetTextEncoding(path)))
                {
                    string content;
                    using (var reader = new StreamReader(path, sjis))
                    {
                        content = reader.ReadToEnd();
                    }
                    doc.Add(new Field(Content, content, _hilightFieldType));
                    useExtractor = false;
                }
            }

            if (useExtractor)
            {
                // Every other case goes through the configured extractor.
                if (_txtExtractMode == TextExtractModes.Tika)
                {
                    var content = _txtExtractor.Extract(path);
                    doc.Add(new Field(Content, content.Text, _hilightFieldType));
                }
                else
                {
                    doc.Add(new Field(Content, IFilterParser.Parse(path), _hilightFieldType));
                }
            }

            doc.Add(new StringField(Path, path, FieldStore.YES));
            doc.Add(new StringField(Title, filename.ToLower(), FieldStore.YES));
            doc.Add(new StringField(Extension, lowerExtension, FieldStore.YES));
            // NOTE: there is no date-typed Field, so the timestamp is kept as a long (yyyyMMddHHmmss).
            long l = long.Parse(fi.LastWriteTime.ToString("yyyyMMddHHmmss"));

            doc.Add(new LongPoint(UpdateDate, l));
            doc.Add(new StoredField(UpdateDate, l));
            indexWriter.AddDocument(doc);

            return(true);
        }
예제 #23
0
        /// <summary>
        /// Extracts "files/file_author.pdf" with the custom result factory and checks that the
        /// "meta:author" metadata entry contains the expected authors in order.
        /// </summary>
        public void should_extract_author_list_from_pdf()
        {
            var extraction = _cut.Extract("files/file_author.pdf", CreateCustomResult);
            var authors    = extraction.Metadata["meta:author"];

            authors.Should().ContainInOrder(
                "Bernal, M. A.",
                "deAlmeida, C. E.",
                "Incerti, S.",
                "Champion, C.",
                "Ivanchenko, V.",
                "Francis, Z.");
        }
예제 #24
0
        /// <summary>
        /// Prompts for a PDF file and, page by page, prints every link annotation found:
        /// the page text lying within the link box, the box position and the link target.
        /// </summary>
        public override void Run()
        {
            // 1. Opening the PDF file...
            string filePath = PromptFileChoice("Please select a PDF file");
            using(files::File file = new files::File(filePath))
            {
                Document document = file.Document;

                // 2. Link extraction from the document pages.
                TextExtractor extractor = new TextExtractor();
                extractor.AreaTolerance = 2; // 2 pt tolerance on area boundary detection.
                bool linkFound = false;
                foreach(Page page in document.Pages)
                {
                    if(!PromptNextPage(page, !linkFound))
                    {
                        Quit();
                        break;
                    }

                    // The page text is extracted lazily, the first time a link is met on the page.
                    IDictionary<RectangleF?,IList<ITextString>> pageText = null;
                    linkFound = false;

                    // Get the page annotations!
                    PageAnnotations annotations = page.Annotations;
                    if(!annotations.Exists())
                    {
                        Console.WriteLine("No annotations here.");
                        continue;
                    }

                    // Iterating through the page annotations looking for links...
                    foreach(Annotation annotation in annotations)
                    {
                        if(!(annotation is Link))
                        {
                            continue;
                        }

                        linkFound = true;
                        if(pageText == null)
                        {
                            pageText = extractor.Extract(page);
                        }

                        Link link = (Link)annotation;
                        RectangleF linkBox = link.Box;

                        // Text.
                        /*
                          NOTE: Links have no strong relation to page text, only a weak location
                          correspondence, so the extracted text is filtered by the link area.
                        */
                        StringBuilder linkTextBuilder = new StringBuilder();
                        foreach(ITextString linkTextString in extractor.Filter(pageText, linkBox))
                        {
                            linkTextBuilder.Append(linkTextString.Text);
                        }
                        string linkText = linkTextBuilder.ToString();
                        Console.WriteLine("Link '" + linkText + "' ");

                        // Position.
                        string position =
                            "    Position: "
                            + "x:" + Math.Round(linkBox.X) + ","
                            + "y:" + Math.Round(linkBox.Y) + ","
                            + "w:" + Math.Round(linkBox.Width) + ","
                            + "h:" + Math.Round(linkBox.Height);
                        Console.WriteLine(position);

                        // Target.
                        Console.Write("    Target: ");
                        PdfObjectWrapper target = link.Target;
                        if(target is Destination)
                        {
                            PrintDestination((Destination)target);
                        }
                        else if(target is actions::Action)
                        {
                            PrintAction((actions::Action)target);
                        }
                        else if(target == null)
                        {
                            Console.WriteLine("[not available]");
                        }
                        else
                        {
                            Console.WriteLine("[unknown type: " + target.GetType().Name + "]");
                        }
                    }

                    if(!linkFound)
                    {
                        Console.WriteLine("No links here.");
                    }
                }
            }
        }
        /// <summary>
        /// Lazily reads the PDF at <c>FilePath</c> and yields one <c>Client</c> for every extracted
        /// text line shaped like "&lt;id&gt; &lt;name words&gt; &lt;discount&gt;%".
        /// </summary>
        /// <remarks>
        /// This is an iterator method: the file is opened when enumeration starts and is disposed
        /// when enumeration completes or the enumerator is disposed. Exceptions raised while opening
        /// the file (the original code singled out <c>System.IO.FileNotFoundException</c>) propagate
        /// to the caller unchanged.
        /// </remarks>
        /// <returns>The clients parsed from the default text area of every page.</returns>
        public IEnumerable <Client> Deserialize()
        {
            const string matchIdExp       = @"[0-9]+";
            const string matchNameExp     = @"([^0-9]\w+\s)+";
            const string matchDiscountExp = @"[0-9]+[.][0-9]+[%]";

            Regex idRegex       = new Regex(matchIdExp);
            Regex nameRegex     = new Regex(matchNameExp);
            Regex discountRegex = new Regex(matchDiscountExp);
            Regex extractRegex  = new Regex(@"[0-9]+\s(\w+\s)+[0-9]+[.][0-9]+[%]");

            // The patterns above fix '.' as the decimal separator, so numbers must be parsed
            // with the invariant culture; the current culture may expect ',' instead.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (File file = new File(FilePath))
            {
                Document pdfDocument = file.Document;

                TextExtractor textExtractor = new TextExtractor();
                foreach (Page page in pdfDocument.Pages)
                {
                    IList <ITextString> textStrings = textExtractor.Extract(page)[TextExtractor.DefaultArea];
                    foreach (ITextString textString in textStrings)
                    {
                        // The whole line
                        string line = textString.Text.Trim();
                        if (!extractRegex.IsMatch(line))
                        {
                            continue;
                        }

                        string idString       = idRegex.Match(line).Value.Trim();
                        string nameString     = nameRegex.Match(line).Value.Trim();
                        string discountString = discountRegex.Match(line).Value.Trim();

                        // remove percentage sign from discount string
                        discountString = discountString.Remove(discountString.Length - 1);

                        Client client = new Client();
                        client.Id       = Convert.ToInt32(idString, invariant);
                        client.Name     = nameString;
                        client.Discount = (float)Convert.ToDecimal(discountString, invariant);

                        yield return(client);
                    }
                }
            }
        }
        /// <summary>
        /// Scans the pages of <paramref name="document"/> and parses transaction lines from the
        /// pages between the one containing both <c>account</c> and <c>begining</c> and the one
        /// containing <c>eof</c>.
        /// </summary>
        /// <param name="document">Document whose pages are scanned in order.</param>
        /// <returns>The transactions collected while scanning (see the review note below).</returns>
        /// <exception cref="InvalidOperationException">
        /// A page inside the transaction section contains neither <c>begining</c> nor <c>flag1</c>,
        /// so the first transaction line cannot be located.
        /// </exception>
        public List <Transaction> Extract(Document document)
        {
            var extractor    = new TextExtractor();
            var inProgress   = false;
            var transactions = new List <Transaction>();
            var year         = 1977;

            foreach (var page in document.Pages)
            {
                IList <ITextString> pageTextStrings = null;
                try
                {
                    pageTextStrings = extractor.Extract(page)[TextExtractor.DefaultArea];
                }
                catch
                {
                    // Best effort: a page whose text cannot be extracted is skipped.
                    continue;
                }
                if (pageTextStrings == null || pageTextStrings.Count == 0)
                {
                    // No text on this page; indexing the header line below would throw.
                    continue;
                }

                // The first line is treated as a header; the token before a standalone "-" is read
                // as the year. The year is only replaced when that token exists and parses, so a
                // malformed header no longer resets it to 0 or indexes position -1.
                var headerSplit = pageTextStrings[0].Text.Split(' ').ToList();
                var dashIdx     = headerSplit.IndexOf("-");
                int headerYear;
                if (dashIdx > 0 && int.TryParse(headerSplit[dashIdx - 1], out headerYear))
                {
                    year = headerYear;
                }

                var accountPage = pageTextStrings.Any(x => x.Text.Contains(account));
                var hasBalances = pageTextStrings.Any(x => x.Text.Contains(begining));

                inProgress = inProgress || (accountPage && hasBalances);
                if (!inProgress)
                {
                    continue;
                }

                // The section ends on the page that contains the eof marker.
                var hasEof = pageTextStrings.Any(y => y.Text.Contains(eof));
                inProgress = !hasEof;

                int startIdx;
                var contText = pageTextStrings.FirstOrDefault(y => y.Text.Contains(flag1));
                if (hasBalances)
                {
                    startIdx = pageTextStrings.IndexOf(pageTextStrings.First(y => y.Text.Contains(begining))) + 1;
                }
                else if (contText != null)
                {
                    startIdx = pageTextStrings.IndexOf(contText) + 1;
                }
                else
                {
                    throw new InvalidOperationException("Could not determine where to start!");
                }

                Transaction transactionNowAndPrevious = null;
                for (var i = startIdx; i < pageTextStrings.Count; i++)
                {
                    var  textString = pageTextStrings[i];
                    var  textParts  = textString.Text.Trim().Split(' ');
                    long eofTest;
                    if (textParts.Length == 1 && !string.IsNullOrEmpty(textParts[0]) && long.TryParse(textParts[0], out eofTest))
                    {
                        // end of page
                        continue;
                    }
                    if (textString.Text.Contains(eof))
                    {
                        break;
                    }

                    decimal  amnt     = 0;
                    decimal  balance  = 0;
                    DateTime dateTest = DateTime.MinValue;
                    if (textParts.Length > 1 && textParts[textParts.Length - 2].Contains(".") && decimal.TryParse(textParts[textParts.Length - 2], out amnt) && decimal.TryParse(textParts[textParts.Length - 1], out balance))
                    {
                        // Line ending with "<amount> <balance>".
                        if (textParts[0].Contains("/"))
                        {
                            var dateString = $"{textParts[0]}/{year}";
                            DateTime.TryParse(dateString, out dateTest);
                        }

                        // Description = everything but the two trailing numbers and the leading token.
                        var desc = textParts.ToList();
                        desc.RemoveAt(desc.Count - 1);
                        desc.RemoveAt(desc.Count - 1);
                        if (desc.Count > 0)
                        {
                            desc.RemoveAt(0);
                        }
                        desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();

                        // NOTE(review): 433 is presumably the x-coordinate separating the deposit and
                        // withdrawal columns — confirm. tstChrs[0] still throws if the line has no
                        // virtual characters.
                        var tstChrs = textString.TextChars.Where(y => y.Virtual == true).ToList();
                        var tst     = tstChrs.Count > 2 ? tstChrs[tstChrs.Count - 3] : tstChrs[0];
                        var typ     = tst.Box.Right < 433 ? TransactionType.Deposit : TransactionType.Withdrawal;

                        transactionNowAndPrevious = new Transaction
                        {
                            Id          = Guid.NewGuid(),
                            Amount      = typ == TransactionType.Deposit ? amnt : amnt * -1,
                            Date        = dateTest,
                            Description = string.Join(" ", desc),
                            Type        = typ,
                            Card        = ParseTrailingCard(desc)
                        };
                    }
                    else if (textParts.Length > 0 && decimal.TryParse(textParts[textParts.Length - 1], out amnt) && DateTime.TryParse($"{textParts[0]}/{year}", out dateTest))
                    {
                        // No balance
                        var desc = textParts.ToList();
                        desc.RemoveAt(desc.Count - 1);
                        if (desc.Count > 0)
                        {
                            desc.RemoveAt(0);
                        }
                        desc = desc.Select(x => x.Trim().Replace(',', '|')).ToList();

                        var tstChrs = textString.TextChars.Where(y => y.Virtual == true).ToList();
                        var tst     = tstChrs.Count > 1 ? tstChrs[tstChrs.Count - 2] : tstChrs[0];
                        var typ     = tst.Box.Right < 433 ? TransactionType.Deposit : TransactionType.Withdrawal;

                        transactionNowAndPrevious = new Transaction
                        {
                            Id          = Guid.NewGuid(),
                            Amount      = typ == TransactionType.Deposit ? amnt : amnt * -1,
                            Date        = dateTest,
                            Description = string.Join(" ", desc),
                            Type        = typ,
                            Card        = ParseTrailingCard(desc)
                        };
                    }
                    else
                    {
                        // Continuation line: its text is appended to the current transaction.
                        // NOTE(review): this is the only place a transaction is added to the result,
                        // so a transaction without a continuation line is never returned and one with
                        // several continuation lines is added once per line — confirm this is intended.
                        var desc = textString.Text.Trim().Split(' ').ToList();

                        transactionNowAndPrevious              = transactionNowAndPrevious ?? new Transaction();
                        transactionNowAndPrevious.Card         = ParseTrailingCard(desc);
                        transactionNowAndPrevious.Description += $" {textString.Text}";
                        transactions.Add(transactionNowAndPrevious);
                    }
                }
            }
            return(transactions);
        }

        /// <summary>
        /// Reads a card number from the last token of a description: the token must parse as an
        /// integer and either follow the word "card" (any casing) or be four characters long.
        /// </summary>
        /// <param name="parts">Description tokens; may be empty.</param>
        /// <returns>The parsed number, or <c>null</c> when there is no such token.</returns>
        private static int? ParseTrailingCard(IList <string> parts)
        {
            if (parts.Count == 0)
            {
                return(null);
            }

            var last          = parts[parts.Count - 1];
            var followsCard   = parts.Count > 1 && parts[parts.Count - 2].ToLower() == "card";
            var hasFourChars  = last.Trim().Length == 4;
            int cardNumber;
            if ((followsCard || hasFourChars) && int.TryParse(last, out cardNumber))
            {
                return(cardNumber);
            }
            return(null);
        }
예제 #27
0
        /// <summary>
        /// Rebuilds the search index: deletes and recreates it, lists the source files, then
        /// processes the batches in <c>FilesToProcess</c> in parallel, extracting each file's
        /// text and metadata and uploading one document at a time.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            ServicePointManager.DefaultConnectionLimit = 10000; //(Or More)

            System.Console.WriteLine(String.Format("Total Min: {0}", DateTime.Now.Subtract(stTime).TotalMinutes));

            Console.WriteLine("{0}", "Deleting index...\n");
            DeleteIndexIfExists(SearchClient, SearchIndexName);

            Console.WriteLine("{0}", "Creating index...\n");
            CreateIndex(SearchClient);

            Console.WriteLine("{0}", "Getting file...\n");
            GetFiles();

            ParallelOptions po = new ParallelOptions();

            po.MaxDegreeOfParallelism = Parallelism;

            // Shared across worker threads; only ever touched through Interlocked.
            int docCounter = 0;

            Parallel.ForEach(FilesToProcess, po, fileList =>
            {
                Console.WriteLine(String.Format("Processing batch #{0}", fileList.Key));
                var textExtractor = new TextExtractor();
                CloudStorageAccount blobStorageAccount = CloudStorageAccount.Parse(SourceBlobConectionString);
                var blobBlobClient = blobStorageAccount.CreateCloudBlobClient();
                var blobContainer  = blobBlobClient.GetContainerReference(SourceBlobContainer);

                SearchIndexClient indexClient = new SearchIndexClient(SearchServiceName, SearchIndexName, new SearchCredentials(SearchAdminApiKey));

                var indexActionList = new List <IndexAction <SearchIndexSchema> >();

                foreach (var file in fileList.Value)
                {
                    try
                    {
                        // Use the value returned by Increment: re-reading docCounter afterwards races
                        // with the other workers and can skip or repeat the progress message.
                        int processed = Interlocked.Increment(ref docCounter);
                        if (processed % 100 == 0)
                        {
                            System.Console.WriteLine(String.Format("Completed {0} docs in {1} min...", processed, DateTime.Now.Subtract(stTime).TotalMinutes));
                        }

                        SearchIndexSchema schema = new SearchIndexSchema();

                        string sasURL = GetBlobSasUri(blobContainer, file);
                        Uri uri       = new Uri(sasURL);

                        var result       = textExtractor.Extract(uri);
                        var blobMetadata = result.Metadata;

                        schema.content = result.Text;
                        schema.metadata_storage_name = blobContainer.Uri.ToString() + "/" + file;
                        schema.metadata_storage_path = HttpServerUtility.UrlTokenEncode(Encoding.ASCII.GetBytes(schema.metadata_storage_name));

                        schema.metadata_content_type  = blobMetadata.ContainsKey("Content-Type") == false ? null : blobMetadata["Content-Type"];
                        schema.metadata_last_modified = blobMetadata.ContainsKey("Last-Modified") == false ? DateTime.Parse("1900-01-01") : DateTimeOffset.Parse(blobMetadata["Last-Modified"]);

                        schema.metadata_word_count      = Utilities.WordCount(schema.content);
                        schema.metadata_storage_size    = Convert.ToInt32(blobMetadata.ContainsKey("Content-Length") == false ? null : blobMetadata["Content-Length"]);
                        schema.metadata_character_count = schema.content.Length;
                        schema.metadata_author          = blobMetadata.ContainsKey("dc:creator") == false ? "" : blobMetadata["dc:creator"];

                        CloudBlockBlob blockBlob = blobContainer.GetBlockBlobReference(file);
                        blockBlob.FetchAttributes();
                        schema.metadata_storage_content_type  = blockBlob.Properties.ContentType;
                        schema.metadata_storage_content_md5   = blockBlob.Properties.ContentMD5;
                        schema.metadata_storage_last_modified = DateTimeOffset.Parse(blockBlob.Properties.LastModified.Value.DateTime.ToString());
                        // NOTE(review): this overwrites the Content-Length based value assigned above
                        // with StreamWriteSizeInBytes, which looks like an upload setting rather than
                        // the blob's size — confirm whether the blob length was intended.
                        schema.metadata_storage_size          = blockBlob.StreamWriteSizeInBytes;

                        // Each document is sent as its own single-item batch.
                        var action = IndexAction.Upload(schema);
                        indexActionList.Add(action);

                        //Console.WriteLine("Indexing Counter: " + docCounter);
                        indexClient.Documents.Index(IndexBatch.New(indexActionList));
                    }
                    catch (IndexBatchException ibe)
                    {
                        // Sometimes when your Search service is under load, indexing will fail for some of the documents in
                        // the batch. Depending on your application, you can take compensating actions like delaying and
                        // retrying. For this simple demo, we just log the failed document keys and continue.
                        Console.WriteLine(
                            "Failed to index some of the documents: {0}",
                            String.Join(", ", ibe.IndexingResults.Where(r => !r.Succeeded).Select(r => r.Key)));
                    }
                    catch (Exception ex)
                    {
                        // Any other failure for this file (extraction, blob access, parsing, ...):
                        // log the message and carry on with the next file.
                        Console.WriteLine(ex.Message);
                    }
                    indexActionList.Clear();
                }
            });

            System.Console.WriteLine(String.Format("Completed {0} docs in {1} min", FilesToProcess.Count, DateTime.Now.Subtract(stTime).TotalMinutes));
        }
예제 #28
0
        /// <summary>
        /// Walks every node of <c>_object</c> and, depending on <c>momentInfo.momentType</c>,
        /// fills in the moment text, attached images, short video and shared item, and collects
        /// the likes and comments found along the way.
        /// </summary>
        private void ExtractItems()
        {
            // Text is extracted for each of the four handled moment types; the other
            // parts are extracted only for their own type.
            var  momentType        = momentInfo.momentType;
            bool extractImgs       = momentType == MomentType.WithImg;
            bool extractVideo      = momentType == MomentType.WithShortVideo;
            bool extractSharedItem = momentType == MomentType.Shared;
            bool extractText       = momentType == MomentType.TextOnly
                                     || extractImgs
                                     || extractVideo
                                     || extractSharedItem;

            for (int index = 0; index < _object.Count; index++)
            {
                var  node     = _object[index];
                Type nodeType = node.GetType();

                if (nodeType != typeof(NSDictionary))
                {
                    // Non-dictionary nodes only matter for shared moments.
                    if (extractSharedItem && nodeType == typeof(NSString))
                    {
                        NSString text = (NSString)node;
                        if (text.Content == "WeChat Sight") // shared WeChat Sight (Weishi) item
                        {
                            IExtractor <SharedItem> weishiSharedExtractor = new WeishiSharedExtractor(_object, index);
                            momentInfo.sharedItem = weishiSharedExtractor.Extract();
                        }
                    }
                    continue;
                }

                NSDictionary entry = (NSDictionary)node;

                if (entry.ContainsKey("bDeleted")) // a comment or a like
                {
                    DateTime createdAt = TimeUtil.TimeStamp2Datetime(entry["createTime"].ToString());
                    string   kind      = entry["type"].ToString();
                    switch (kind)
                    {
                    case "1": // like
                        IExtractor <LikedInfo> likeExtractor = new LikeExtractor(_object, index, createdAt);
                        likes.Add(likeExtractor.Extract());
                        break;

                    case "2": // comment
                        IExtractor <CommentInfo> commentExtractor = new CommentExtractor(_object, index, createdAt);
                        CommentInfo comment = commentExtractor.Extract();
                        if (comment != null)
                        {
                            comments.Add(comment);
                        }
                        break;
                    }
                    continue;
                }

                if (extractText && entry.ContainsValue("WCAppInfo")) // text
                {
                    IExtractor <string> textExtractor = new TextExtractor(_object, index, momentInfo.posterInfo);
                    momentInfo.momentText = textExtractor.Extract();
                    continue;
                }

                if (extractImgs && entry.ContainsKey("encIdx")) // attached image
                {
                    IExtractor <Uri> imgExtractor = new ImgExtractor(_object, index);
                    Uri imgUrl = imgExtractor.Extract();
                    if (imgUrl != null)
                    {
                        imgs.Add(imgUrl);
                    }
                    continue;
                }

                if (extractVideo && entry.ContainsValue("WCUrl")) // video
                {
                    IExtractor <Uri> shortVideoExtractor = new ShortVideoExtractor(_object, index);
                    momentInfo.shortVideoUrl = shortVideoExtractor.Extract();
                }
            }
        }
예제 #29
0
        /// <summary>
        /// Extracts the text of every page of <paramref name="document"/>.
        /// </summary>
        /// <param name="document">Document whose pages are read in order.</param>
        /// <returns>
        /// One string per page, in page order. If extraction fails part-way, the failure is
        /// logged and the pages collected so far are returned.
        /// </returns>
        private List<string> ExtractPageList(Document document)
        {
            List<string> pageList = new List<string>();
            TextExtractor textExtractor = new TextExtractor();
            try
            {
                foreach (var page in document.Pages)
                {
                    var textStrings = textExtractor.Extract(page);
                    pageList.Add(TextExtractor.ToString(textStrings));
                }
            }
            catch (Exception e)
            {
                // Best effort: stop at the first failing page, but keep the exception details
                // in the log so the failure can be diagnosed.
                Utility.Log("Blad: " + e);
            }
            return pageList;
        }
예제 #30
0
        /// <summary>
        /// Extracting a path that does not exist must raise a <c>TextExtractionException</c>
        /// whose message mentions that path.
        /// </summary>
        public void non_existing_files_should_fail_with_exception()
        {
            const string fileName = "files/doesnotexist.mp3";

            var thrown = typeof(TextExtractionException).ShouldBeThrownBy(() => _cut.Extract(fileName));

            thrown.Message.ShouldContain(fileName);
        }
예제 #31
0
        public void FixPageRef(string outputPath, string pdfFilePath, string footerPrefix = "Belastingblad 2014/")
        {
            var mainSection = new InputData(Path.Combine(outputPath, "Section0000.html"));
            var body        = mainSection.Document.Descendants().First(e => "body".Equals(e.Name.LocalName));
            var divs        = body.Elements()
                              .Where(
                e =>
                "div".Equals(e.Name.LocalName) && e.Attribute("class") != null &&
                "hftekst".Equals(e.Attribute("class").Value));
            Dictionary <int, string> pdfPages = TextExtractor.Extract(pdfFilePath);

            var collected = new HashSet <int>();

            foreach (var div in divs)
            {
                foreach (var level1 in div.Elements())
                {
                    if (level1.Elements().Any(e => "div".Equals(e.Name.LocalName) && e.Attribute("class") != null && ("kopgegp".Equals(e.Attribute("class").Value) || "hftekst".Equals(e.Attribute("class").Value) || "auteurgeg".Equals(e.Attribute("class").Value))))
                    {
                        foreach (var level2 in level1.Elements())
                        {
                            if (level2.Elements().Any(e => "div".Equals(e.Name.LocalName) && e.Attribute("class") != null && ("kopgegp".Equals(e.Attribute("class").Value) || "hftekst".Equals(e.Attribute("class").Value) || "auteurgeg".Equals(e.Attribute("class").Value))))
                            {
                                foreach (var level3 in level2.Elements())
                                {
                                    if (level3.Elements().Any(e => "div".Equals(e.Name.LocalName) && e.Attribute("class") != null && ("kopgegp".Equals(e.Attribute("class").Value) || "hftekst".Equals(e.Attribute("class").Value) || "auteurgeg".Equals(e.Attribute("class").Value))))
                                    {
                                        foreach (var level4 in level3.Elements())
                                        {
                                            string val = ValueOfElement(level4);
                                            foreach (
                                                var pdfPage in pdfPages.Where(e => !collected.Contains(e.Key) && e.Key > 2))
                                            {
                                                string[] lines = pdfPage.Value.Split('\n');
                                                int      lastLinesHasBtwBrief = lines.Count() - 1;
                                                while (lastLinesHasBtwBrief > -1)
                                                {
                                                    if (lines[lastLinesHasBtwBrief].Contains(footerPrefix))
                                                    {
                                                        break;
                                                    }
                                                    lastLinesHasBtwBrief--;
                                                }
                                                if (lastLinesHasBtwBrief + 1 == lines.Count())
                                                {
                                                    continue;
                                                }
                                                string firstSentence =
                                                    Regex.Replace(lines[lastLinesHasBtwBrief + 1].ToLower(), @"\s+", "")
                                                    .TrimEnd('-');
                                                while (firstSentence.Length <= 40)
                                                {
                                                    firstSentence =
                                                        Regex.Replace(lines[lastLinesHasBtwBrief++ + 1].ToLower(), @"\s+",
                                                                      "")
                                                        .TrimEnd('-');
                                                    if (lastLinesHasBtwBrief == lines.Count() - 1)
                                                    {
                                                        break;
                                                    }
                                                }
                                                if (val.Contains(firstSentence))
                                                {
                                                    //level1.Add(new XElement("pageref", pdfPage.Key));
                                                    var pagerefdiv = new XElement("div",
                                                                                  new XAttribute("class", "pagerefdiv"));
                                                    var a = new XElement("a",
                                                                         new XAttribute("class", "pcalibre pageref pcalibre1"),
                                                                         new XAttribute("href", "#backpageref_" + pdfPage.Key),
                                                                         new XAttribute("id", "pageref_" + pdfPage.Key),
                                                                         new XText(pdfPage.Key.ToString()));
                                                    pagerefdiv.Add(a);
                                                    level4.Add(pagerefdiv);
                                                    collected.Add(pdfPage.Key);
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        string val = ValueOfElement(level3);
                                        foreach (
                                            var pdfPage in pdfPages.Where(e => !collected.Contains(e.Key) && e.Key > 2))
                                        {
                                            string[] lines = pdfPage.Value.Split('\n');
                                            int      lastLinesHasBtwBrief = lines.Count() - 1;
                                            while (lastLinesHasBtwBrief > -1)
                                            {
                                                if (lines[lastLinesHasBtwBrief].Contains(footerPrefix))
                                                {
                                                    break;
                                                }
                                                lastLinesHasBtwBrief--;
                                            }
                                            if (lastLinesHasBtwBrief + 1 == lines.Count())
                                            {
                                                continue;
                                            }
                                            string firstSentence =
                                                Regex.Replace(lines[lastLinesHasBtwBrief + 1].ToLower(), @"\s+", "")
                                                .TrimEnd('-');
                                            while (firstSentence.Length <= 40)
                                            {
                                                firstSentence =
                                                    Regex.Replace(lines[lastLinesHasBtwBrief++ + 1].ToLower(), @"\s+",
                                                                  "")
                                                    .TrimEnd('-');
                                                if (lastLinesHasBtwBrief == lines.Count() - 1)
                                                {
                                                    break;
                                                }
                                            }
                                            if (val.Contains(firstSentence))
                                            {
                                                //level1.Add(new XElement("pageref", pdfPage.Key));
                                                var pagerefdiv = new XElement("div",
                                                                              new XAttribute("class", "pagerefdiv"));
                                                var a = new XElement("a",
                                                                     new XAttribute("class", "pcalibre pageref pcalibre1"),
                                                                     new XAttribute("href", "#backpageref_" + pdfPage.Key),
                                                                     new XAttribute("id", "pageref_" + pdfPage.Key),
                                                                     new XText(pdfPage.Key.ToString()));
                                                pagerefdiv.Add(a);
                                                level3.Add(pagerefdiv);
                                                collected.Add(pdfPage.Key);
                                                break;
                                            }
                                        }
                                    }
                                }
                            }
                            else
                            {
                                string val = ValueOfElement(level2);
                                foreach (var pdfPage in pdfPages.Where(e => !collected.Contains(e.Key) && e.Key > 2))
                                {
                                    string[] lines = pdfPage.Value.Split('\n');
                                    int      lastLinesHasBtwBrief = lines.Count() - 1;
                                    while (lastLinesHasBtwBrief > -1)
                                    {
                                        if (lines[lastLinesHasBtwBrief].Contains(footerPrefix))
                                        {
                                            break;
                                        }
                                        lastLinesHasBtwBrief--;
                                    }
                                    if (lastLinesHasBtwBrief + 1 == lines.Count())
                                    {
                                        continue;
                                    }
                                    string firstSentence =
                                        Regex.Replace(lines[lastLinesHasBtwBrief + 1].ToLower(), @"\s+", "")
                                        .TrimEnd('-');
                                    while (firstSentence.Length <= 40)
                                    {
                                        firstSentence =
                                            Regex.Replace(lines[lastLinesHasBtwBrief++ + 1].ToLower(), @"\s+", "")
                                            .TrimEnd('-');
                                        if (lastLinesHasBtwBrief == lines.Count() - 1)
                                        {
                                            break;
                                        }
                                    }
                                    if (val.Contains(firstSentence))
                                    {
                                        //level1.Add(new XElement("pageref", pdfPage.Key));
                                        var pagerefdiv = new XElement("div", new XAttribute("class", "pagerefdiv"));
                                        var a          = new XElement("a", new XAttribute("class", "pcalibre pageref pcalibre1"),
                                                                      new XAttribute("href", "#backpageref_" + pdfPage.Key),
                                                                      new XAttribute("id", "pageref_" + pdfPage.Key),
                                                                      new XText(pdfPage.Key.ToString()));
                                        pagerefdiv.Add(a);
                                        level2.Add(pagerefdiv);
                                        collected.Add(pdfPage.Key);
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        string val = ValueOfElement(level1);
                        foreach (var pdfPage in pdfPages.Where(e => !collected.Contains(e.Key) && e.Key > 2))
                        {
                            string[] lines = pdfPage.Value.Split('\n');
                            int      lastLinesHasBtwBrief = lines.Count() - 1;
                            while (lastLinesHasBtwBrief > -1)
                            {
                                if (lines[lastLinesHasBtwBrief].Contains(footerPrefix))
                                {
                                    break;
                                }
                                lastLinesHasBtwBrief--;
                            }
                            if (lastLinesHasBtwBrief + 1 == lines.Count())
                            {
                                continue;
                            }
                            string firstSentence =
                                Regex.Replace(lines[lastLinesHasBtwBrief + 1].ToLower(), @"\s+", "").TrimEnd('-');
                            while (firstSentence.Length <= 40)
                            {
                                firstSentence =
                                    Regex.Replace(lines[lastLinesHasBtwBrief++ + 1].ToLower(), @"\s+", "").TrimEnd('-');
                                if (lastLinesHasBtwBrief == lines.Count() - 1)
                                {
                                    break;
                                }
                            }
                            if (val.Contains(firstSentence))
                            {
                                //level1.Add(new XElement("pageref", pdfPage.Key));
                                var pagerefdiv = new XElement("div", new XAttribute("class", "pagerefdiv"));
                                var a          = new XElement("a", new XAttribute("class", "pcalibre pageref pcalibre1"),
                                                              new XAttribute("href", "#backpageref_" + pdfPage.Key),
                                                              new XAttribute("id", "pageref_" + pdfPage.Key), new XText(pdfPage.Key.ToString()));
                                pagerefdiv.Add(a);
                                level1.Add(pagerefdiv);
                                collected.Add(pdfPage.Key);
                                break;
                            }
                        }
                    }
                }
            }
            var divcolofon = new XElement("div", new XAttribute("class", "colofon"));
            var h2         = new XElement("h2", new XAttribute("class", "kopgeghftekst"));
            var span       = new XElement("span", new XAttribute("class", "hfteksttitel"), new XAttribute("id", "colofon"), new XText("Colofon"));
            var img        = new XElement("img", new XAttribute("src", "../Images/colofon.jpg"), new XAttribute("alt", ""),
                                          new XAttribute("class", "calibre7"));

            h2.Add(span);
            divcolofon.Add(h2, img);
            body.Add(divcolofon);

            var d  = new XElement("div", new XAttribute("class", "hftekst"));
            var a1 = new XElement("a", new XAttribute("href", "../Text/Inhoud.html#paginaregister"),
                                  new XAttribute("id", "paginaregister"), new XAttribute("class", "vindplaats1"), new XText(" Paginaregister "));

            d.Add(a1);
            foreach (var i in collected)
            {
                var spanpagereglinkdiv = new XElement("span", new XAttribute("class", "pagereglinkdiv"));
                var apagereglinkdiv    = new XElement("a", new XAttribute("class", "pagereglink"),
                                                      new XAttribute("href", "#pageref_" + i), new XAttribute("id", "backpageref_" + i),
                                                      new XText(i.ToString()));
                spanpagereglinkdiv.Add(apagereglinkdiv);
                d.Add(spanpagereglinkdiv);
            }
            body.Add(d);
            mainSection.Document.Write(Path.Combine(outputPath, "fixRefSection0000.html"), Formatting.Indented);
        }
예제 #32
0
        /// <summary>
        /// Extracts the text of the Scaliger PDF, groups the formatted lines into chunks of
        /// roughly 5000 characters and appends any translated chunks to a text file.
        /// </summary>
        /// <remarks>
        /// The translation call itself is commented out, so <c>translatedText</c> stays empty
        /// and only extraction and chunking are effectively exercised.
        /// </remarks>
        public void TestTranslateScaliger()
        {
            var te          = new TextExtractor();
            var pdfContents = te.Extract(@"Iosephi_Scaligeri_Opus_de_emendatione_te.pdf");

            Assert.IsNotNull(pdfContents);

            var lines = FormatLines(pdfContents);

            Assert.IsTrue(lines.Length > 10);

            var pages = new List <string>();

            var output = new StringBuilder();

            foreach (var line in lines)
            {
                // Flush the current chunk before it would reach the 5000-character limit.
                // The non-empty guard avoids emitting an empty chunk when a single line is
                // itself at or above the limit (that line then forms its own chunk).
                if (output.Length > 0 && output.Length + line.Length >= 5000)
                {
                    pages.Add(output.ToString());
                    output.Clear();
                }

                // Always keep the current line; previously the line that triggered a flush
                // was silently dropped.
                output.Append(line);
            }
            if (output.Length > 0)
            {
                pages.Add(output.ToString());
            }

            Assert.IsNotNull(pages[0]);

            var translatedText = new List <string>();

            // NOTE(review): client is only used by the commented-out translation call below;
            // it is kept because Create() may have side effects (e.g. credential lookup) - confirm.
            TranslationClient client = TranslationClient.Create();
            int startPage            = 162;

            for (int i = startPage; i < pages.Count; i++)
            {
                var page = pages[i];
                try
                {
                    if (page.Length > 0)
                    {
                        //var response = client.TranslateText(page.Replace("fign","sign").Replace("ff", "ss").Replace("bf", "bs").Replace("fs", "ss").Replace("fc", "sc").Replace("ft", "st").Replace("fol", "sol"), "en");
                        //translatedText.Add(response.TranslatedText);
                        //System.IO.File.WriteAllText("Translation\\de_emendatione_temporum_translated_" + i, response.TranslatedText);
                        //System.Threading.Thread.Sleep(5000);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failing chunk must not abort the remaining ones, but the
                    // failure should at least be visible instead of being swallowed.
                    Console.Error.WriteLine("Translation of page " + (i + 1) + " failed: " + ex.Message);
                }
            }

            // Dispose the writer so buffered text is flushed and the file handle is released.
            using (var stream = System.IO.File.AppendText("de_emendatione_temporum_translated.txt"))
            {
                // translatedText is filled with Add(), so its entries start at index 0 and the
                // first one belongs to pages[startPage]; offset the label rather than the index.
                // NOTE(review): labels drift if a chunk is skipped (empty or failed) - confirm
                // whether exact page numbers matter here.
                for (int i = 0; i < translatedText.Count; i++)
                {
                    var page = translatedText[i];
                    stream.WriteLine("Page " + (startPage + i + 1));
                    stream.WriteLine();
                    stream.WriteLine(page);
                    stream.WriteLine();
                }
            }
        }
 /// <summary>
 /// Verifies that <c>TextExtractor.Extract</c> throws an
 /// <see cref="ArgumentNullException"/> when it is given a null string argument.
 /// </summary>
 public void TestExtractMethodWithUndefinedFile()
 {
     // Typed as string so the string overload of Extract is selected.
     string undefinedFile = null;
     Action extractUndefined = () => TextExtractor.Extract(undefinedFile);

     Assert.ThrowsException <ArgumentNullException>(extractUndefined);
 }