/// <summary>
/// Creates a new <see cref="WordOnPage"/> whose <c>CombinedWords</c> list gathers the
/// word and previously combined words of both input entries.
/// </summary>
/// <param name="wop1">First entry to fold in. Must not be null.</param>
/// <param name="wop2">Second entry to fold in. A null value is tolerated and simply contributes nothing.</param>
/// <param name="IndexAs">Not applied by this method (see the review note in the body).</param>
/// <returns>A new entry; this method assigns only its <c>CombinedWords</c>.</returns>
public static WordOnPage Combine(WordOnPage wop1, WordOnPage wop2, string IndexAs)
{
    List<string> combined = new List<string>();

    AppendWordsFrom(wop1, combined);

    // Earlier revisions never read wop2, so callers may have been passing null here;
    // keep accepting that instead of failing.
    if (wop2 != null)
    {
        AppendWordsFrom(wop2, combined);
    }

    // NOTE(review): IndexAs is accepted but never stored. The member that should receive it
    // is not visible from this method -- confirm whether it belongs in the result's Word.
    WordOnPage wop = new WordOnPage();
    wop.CombinedWords = combined;
    return wop;
}

/// <summary>
/// Appends <paramref name="source"/>'s word, followed by any words it already combines, to <paramref name="target"/>.
/// </summary>
private static void AppendWordsFrom(WordOnPage source, List<string> target)
{
    target.Add(source.Word);

    // An entry that has never been combined may not have a list yet.
    if (source.CombinedWords == null)
    {
        return;
    }

    foreach (string str in source.CombinedWords)
    {
        target.Add(str);
    }
}
/// <summary>
/// Builds the word index for a document on a task started with <c>Task.Run</c>.
/// When <paramref name="docType"/> is PDF, <c>PagesInBook</c> is replaced with one
/// <c>BookPage</c> per PDF page; for any other type the existing <c>PagesInBook</c> is
/// indexed as-is. The resulting index is stored in <c>RawWordList</c>, ordered by word.
/// </summary>
/// <param name="DocumentToLoad">Stream opened as a PDF when <paramref name="docType"/> is PDF; not read otherwise.</param>
/// <param name="docType">Kind of document being processed.</param>
/// <param name="DictionaryMatch">When true, every indexed word receives a <c>WordType</c> from <c>DictionaryWords</c>, or "Unknown" when no entry matches.</param>
/// <param name="ExcludeNumberWords">When true, words for which <c>ContainsNumber</c> returns true are left out of the index.</param>
/// <returns>
/// True when processing ran to completion; false when an exception was caught.
/// A caught exception is written to debug output and reported through an Error update event.
/// </returns>
public async Task<bool> ProcessIt(Stream DocumentToLoad, DocumentTypes docType, bool DictionaryMatch, bool ExcludeNumberWords)
{
    return await Task.Run(() =>
    {
        try
        {
            if (docType == DocumentTypes.PDF)
            {
                LoadPagesFromPdf(DocumentToLoad);
            }

            // OrderBy is a stable sort, so words with equal keys keep their first-seen order.
            RawWordList = BuildWordIndex(ExcludeNumberWords).OrderBy(p => p.Word).ToList();

            if (DictionaryMatch)
            {
                ApplyDictionaryWordTypes();
            }

            return true;
        }
        catch (Exception ex)
        {
            Debug.WriteLine("ERROR:" + ex.Message);
            FireUpdateEvent(EventUpdateTypes.Error, ex.Message, 0, 0, 0);

            // Report the failure to the caller as well as through the event.
            return false;
        }
    });
}

/// <summary>
/// Replaces <c>PagesInBook</c> with the extracted text of every page of the PDF in <paramref name="DocumentToLoad"/>.
/// </summary>
private void LoadPagesFromPdf(Stream DocumentToLoad)
{
    // Reset before opening the document so a failed load does not leave pages from a previous run.
    PagesInBook = new List<BookPage>();

    using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(DocumentToLoad)))
    {
        int maxPages = pdfDoc.GetNumberOfPages();

        // The loop starts at 1 and runs through maxPages inclusive.
        for (int x = 1; x <= maxPages; x++)
        {
            FireUpdateEvent(EventUpdateTypes.ParsingFromPDF, "Page: " + x.ToString(), maxPages, x, Warnings.Count());
            PagesInBook.Add(new BookPage(PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(x))));
        }
    }
}

/// <summary>
/// Walks every word of every page in <c>PagesInBook</c> and returns one entry per distinct word
/// (compared lower-cased and trimmed), in first-seen order, each listing the pages it occurs on.
/// </summary>
private List<WordOnPage> BuildWordIndex(bool ExcludeNumberWords)
{
    List<WordOnPage> index = new List<WordOnPage>();

    // Lookup by normalized word, so each word costs one hash probe instead of a scan of the index.
    Dictionary<string, WordOnPage> entriesByKey = new Dictionary<string, WordOnPage>();

    int pageCount = 0;
    int pageTotal = PagesInBook.Count();

    foreach (BookPage bp in PagesInBook)
    {
        pageCount++;
        FireUpdateEvent(EventUpdateTypes.Indexing, "Page: " + pageCount.ToString(), pageTotal, pageCount, Warnings.Count());

        foreach (string word in bp.Words)
        {
            if (ContainsNumber(word) && ExcludeNumberWords)
            {
                continue;
            }

            string key = word.ToLower().Trim();
            WordOnPage entry;

            if (entriesByKey.TryGetValue(key, out entry))
            {
                // Known word: record this page only once, however often the word repeats on it.
                if (!entry.Page.Any(p => p.Page == pageCount))
                {
                    entry.Page.Add(new PageClass(pageCount));
                }
            }
            else
            {
                // First sighting: the entry keeps the casing of this first occurrence.
                entry = new WordOnPage();
                entry.Word = word.Trim();
                entry.Page = new List<PageClass>();
                entry.Page.Add(new PageClass(pageCount));

                index.Add(entry);
                entriesByKey.Add(key, entry);
            }
        }
    }

    return index;
}

/// <summary>
/// Sets <c>WordType</c> on every entry of <c>RawWordList</c> from the first entry of
/// <c>DictionaryWords</c> whose word matches (lower-cased and trimmed), or to "Unknown".
/// </summary>
private void ApplyDictionaryWordTypes()
{
    // Build the lookup once; the first dictionary entry for a given key wins.
    Dictionary<string, DictionaryWord> dictionaryByKey = new Dictionary<string, DictionaryWord>();
    foreach (DictionaryWord dic in DictionaryWords)
    {
        // An entry without a word can never match anything, so skip it.
        if (dic.Word == null)
        {
            continue;
        }

        string key = dic.Word.ToLower().Trim();
        if (!dictionaryByKey.ContainsKey(key))
        {
            dictionaryByKey.Add(key, dic);
        }
    }

    int c = 0;
    foreach (WordOnPage wop in RawWordList)
    {
        c++;
        FireUpdateEvent(EventUpdateTypes.DictionarySearch, wop.Word, RawWordList.Count, c, Warnings.Count());

        DictionaryWord match;
        if (dictionaryByKey.TryGetValue(wop.Word.ToLower().Trim(), out match))
        {
            wop.WordType = match.WordType;
        }
        else
        {
            wop.WordType = "Unknown";
        }
    }
}