/// <summary>
/// Extracts the text of the PDF located at <c>PdfFile</c>. When the document reports
/// itself as encrypted it is first unlocked with <c>PdfPassword</c>.
/// </summary>
/// <returns>The text returned by <c>PDFTextStripper.getText</c> for the document.</returns>
/// <remarks>
/// Any <c>java.io.IOException</c> raised while extracting propagates unchanged
/// (the previous <c>catch { throw ex; }</c> only reset its stack trace).
/// </remarks>
public string PdfToText()
{
    PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));
    parser.parse();
    PDDocument originialPdfDoc = parser.getPDDocument();
    try
    {
        if (originialPdfDoc.isEncrypted())
        {
            originialPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
        }

        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(originialPdfDoc);
    }
    finally
    {
        // Release the document on success and on failure alike.
        originialPdfDoc.close();
    }
}
/// <summary>
/// Converts <c>terms-and-conditions.pdf</c>, located under <c>requiredPath</c>, to
/// <c>terms-and-conditions.txt</c> in the same folder.
/// </summary>
/// <remarks>
/// A leading <c>file:\</c> prefix is stripped from <c>requiredPath</c> and backslashes are
/// turned into forward slashes before the file names are appended. Exceptions are written
/// to the console rather than propagated.
/// </remarks>
public void readfrompdffile()
{
    string clearedfilepath = requiredPath.Replace("file:\\", "").Replace("\\", "/");
    string file = clearedfilepath + "/terms-and-conditions.pdf";
    string outFile = clearedfilepath + "/terms-and-conditions.txt";
    try
    {
        if (!File.Exists(file))
        {
            // Retry with the absolute form of the path before giving up.
            file = Path.GetFullPath(file);
            if (!File.Exists(file))
            {
                Console.WriteLine("Please give in the path to the PDF file.");
                // Nothing to convert: previously execution fell through and the
                // parser was invoked on a file known not to exist.
                return;
            }
        }

        PDFParser pdfParser = new PDFParser();
        pdfParser.ExtractText(file, outFile);
    }
    catch (Exception exc)
    {
        Console.WriteLine(exc);
    }
}
/// <summary>
/// Runs <c>PDFParser.ExtractText</c> against a fixed address and writes the output to
/// <c>outfile.txt</c>. The value returned by the extractor is not used.
/// </summary>
public static void ParsePdf()
{
    var extractor = new PDFParser();
    const string SourceAddress = @"http://ec.europa.eu/health/documents/community-register/html/h_direct_anx.htm#412_et";
    const string OutputFile = "outfile.txt";

    extractor.ExtractText(SourceAddress, OutputFile);
}
/// <summary>
/// Loads <c>_FileName</c> into <c>_UploadedSheetTable</c>, choosing the parser from the
/// text after the last '.' in the file name (csv, xls/xlsx/xlsm or pdf).
/// </summary>
/// <exception cref="Exception">
/// Thrown when <c>_UploadedSheetTable</c> is still null afterwards, e.g. because the file
/// does not exist or its extension is not recognised.
/// </exception>
private void LoadFileIntoDataTable()
{
    if (File.Exists(_FileName))
    {
        int extensionStart = _FileName.LastIndexOf(".") + 1;
        string extension = _FileName.Substring(extensionStart).ToLower();

        if (extension == "csv")
        {
            string csvText = Utilities.readFile(_FileName);
            _UploadedSheetTable = CsvParser.Parse(csvText);
        }
        else if (extension == "xls" || extension == "xlsx" || extension == "xlsm")
        {
            _UploadedSheetTable = ExcelParser.Parse(_FileName);
        }
        else if (extension == "pdf")
        {
            _UploadedSheetTable = PDFParser.Parse(_FileName);
        }
        // Any other extension leaves the table untouched.
    }

    if (_UploadedSheetTable == null)
    {
        throw new Exception("There was an issue converting the file to a data table.");
    }
}
/// <summary>
/// Click handler: parses every file found by <c>GetFiles</c> under the folder typed into
/// <c>txtPDFPath</c> and writes each non-empty result to
/// <c>&lt;current directory&gt;/result/&lt;name&gt;.txt</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void btnStartParser_Click(object sender, RoutedEventArgs e)
{
    string pdfPath = this.txtPDFPath.Text;
    if (string.IsNullOrEmpty(pdfPath))
    {
        return;
    }

    DirectoryInfo dirInfos = new DirectoryInfo(pdfPath);
    List<FileInfo> itemfiles = new List<FileInfo>();
    GetFiles(dirInfos, ref itemfiles);
    if (itemfiles == null || itemfiles.Count == 0)
    {
        return;
    }

    string outputDirectory = System.IO.Path.Combine(Environment.CurrentDirectory, "result");

    foreach (var fileInfo in itemfiles)
    {
        var result = PDFParser.Parser(fileInfo.FullName, TableContainType.CSV);
        if (result == null || string.IsNullOrWhiteSpace(result.Text))
        {
            continue;
        }

        // Strip only a trailing ".pdf" (any casing). The previous Replace(".pdf", "")
        // removed every occurrence inside the name and missed upper-case extensions.
        string baseName = fileInfo.Name;
        if (baseName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
        {
            baseName = baseName.Substring(0, baseName.Length - ".pdf".Length);
        }

        // WriteAllText does not create missing folders; CreateDirectory is a no-op
        // when the folder already exists.
        Directory.CreateDirectory(outputDirectory);
        File.WriteAllText(System.IO.Path.Combine(outputDirectory, baseName + ".txt"), result.Text);
    }
}
/// <summary>
/// Click handler: for every member whose giro card has not been downloaded yet, generates
/// the invoice PDF, renames it so the file name carries the member's year group, and
/// downloads it to the folder typed into <c>txtDownloadPath</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void cmdGetGiroKort_Click(object sender, EventArgs e)
{
    PDFParser parser = new PDFParser();
    var list = Medlem.GetMedlemmer()
        .Where(x => !parser.HasGiroKortBeenDownloaded(x.MemberId))
        .OrderBy(x => x.Årgang)
        .ThenBy(x => x.Navn);

    foreach (var medlem in list)
    {
        String medlemsNummer = medlem.MemberId;
        var source = PDFParser.GetGiroKortPathForPrint(medlemsNummer);
        String outfile = PDFParser.GetInvoice(medlemsNummer, source, PDFParser.GetInvoicePathNoFrames());

        // The "no giro found" message used to sit in an else branch that could only be
        // reached with an empty (non-null) path, after File.Move had already been
        // attempted on it. Report both the null and the empty case up front instead.
        if (String.IsNullOrEmpty(outfile))
        {
            Response.Write("Intet girokort fundet for medlem");
            continue;
        }

        var filename = outfile.Replace(medlemsNummer, medlem.Årgang + "-" + medlemsNummer);
        Response.Write("Found giro: " + filename + "<br/>");
        Response.Flush();

        // When the member number does not occur in the path the two names are equal;
        // deleting "the target" would then delete the source and make the move fail.
        if (filename != outfile)
        {
            if (System.IO.File.Exists(filename))
            {
                System.IO.File.Delete(filename);
            }
            System.IO.File.Move(outfile, filename);
        }

        TheDownload(
            "http://" + Request.Url.Host + ":" + Request.Url.Port + "/Upload/KIF/" + medlemsNummer + ".pdf",
            txtDownloadPath.Text + "\\" + medlemsNummer + ".pdf");
    }
}
/// <summary>
/// Extracts the text of the PDF attached to a service and stores it, together with a
/// generated title, in the PDF's Facebook fields.
/// </summary>
/// <param name="serviceId">Primary key of the service to process.</param>
/// <returns>
/// JSON <c>"success"</c> after the changes were saved; JSON <c>"Error"</c> when the service,
/// its PDF or the PDF path is missing, or when the extractor returns the string "false".
/// </returns>
public ActionResult callExtraxtText(int serviceId)
{
    Service service = db.Services.Find(serviceId);
    if (service == null || service.PDF == null)
    {
        return Json("Error");
    }

    string relativePath = service.PDF.PDFPath;
    if (relativePath == null)
    {
        return Json("Error");
    }

    string archivedFile = ConfigurationManager.AppSettings["pdfArchive"] + @"\PDFs\" + relativePath;
    PDFParser reader = new PDFParser();
    string extracted = reader.ExtractText(serviceId, archivedFile);
    PDF pdf = service.PDF;

    // The extractor signals failure with the literal string "false".
    if (extracted == "false")
    {
        return Json("Error");
    }

    pdf.FacebookDescription = extracted;
    pdf.FacebookTitle = service.FirstName + " " + service.LastName + "'s Memorial Folder";
    db.SaveChanges();
    return Json("success");
}
/// <summary>
/// Opens the PDF at <c>PdfFile</c> (decrypting it with <c>PdfPassword</c> when needed) and,
/// if its AcroForm contains a field called "Name", sets that field's value to "name".
/// </summary>
/// <returns>Always <see cref="String.Empty"/>; no text is collected by this method.</returns>
/// <remarks>
/// NOTE(review): the modified document is not saved anywhere in this method, so the field
/// change only exists in memory — confirm whether a save step is missing.
/// A <c>java.io.IOException</c> propagates unchanged.
/// </remarks>
public string PdfFields()
{
    string pdfText = String.Empty;
    PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));
    parser.parse();
    PDDocument originialPdfDoc = parser.getPDDocument();
    try
    {
        if (originialPdfDoc.isEncrypted())
        {
            originialPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
        }

        PDDocumentCatalog docCatalog = originialPdfDoc.getDocumentCatalog();
        PDAcroForm acroForm = docCatalog.getAcroForm();

        // A null AcroForm (document without a form) is treated like a missing field
        // instead of being dereferenced.
        PDField field = acroForm != null ? acroForm.getField("Name") : null;
        if (field != null)
        {
            field.setValue("name");
        }
    }
    finally
    {
        originialPdfDoc.close();
    }

    return pdfText;
}
/// <summary>
/// Console entry point: extracts the text of a fixed desktop PDF into a fixed text file,
/// then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var extractor = new PDFParser();
    const string InputPdf = @"C:\Users\unknown\Desktop\oyak.pdf";
    const string OutputText = @"C:\Users\unknown\Desktop\output.txt";

    extractor.ExtractText(InputPdf, OutputText);

    // Keep the console window open until the user presses a key.
    Console.ReadKey();
}
/// <summary>
/// Reads one indirect object ("&lt;id&gt; &lt;gen&gt; obj ... endobj") from the parser's stream,
/// starting at <paramref name="offset"/>.
/// </summary>
/// <param name="p">Parser whose stream is repositioned and read.</param>
/// <param name="offset">Stream position at which the object header starts.</param>
/// <param name="id">Expected object number, or -1 to skip the check.</param>
/// <param name="gene">Expected generation number, or -1 to skip the check.</param>
/// <returns>The object body; a <c>CorePDFStream</c> when the body is followed by a "stream" keyword.</returns>
/// <exception cref="PDFSyntaxException">
/// Thrown when the header or keywords do not match, or when a token has an unexpected type.
/// </exception>
internal CorePDFDirect ReadDirectFromStream(PDFParser p, long offset, int id, int gene)
{
    p.Stream.Position = offset;
    try
    {
        // Header: "<object number> <generation> obj"
        PDFInteger objectNumber = (PDFInteger)p.ReadNextObject();
        if ((objectNumber.Value != ((long)id)) && (id != -1))
        {
            throw new PDFSyntaxException("XREF points to incorrect object. objId doesn\'t match");
        }

        PDFInteger generation = (PDFInteger)p.ReadNextObject();
        if ((generation.Value != ((long)gene)) && (gene != -1))
        {
            throw new PDFSyntaxException("XREF points to incorrect object. Generation number doesn\'t match");
        }

        PDFName keyword = (PDFName)p.ReadNextObject();
        if (keyword.Value != "obj")
        {
            throw new PDFSyntaxException("XREF points to incorrect object. \'obj\' token missing");
        }

        CorePDFDirect body = (CorePDFDirect)p.ReadNextObject();
        keyword = (PDFName)p.ReadNextObject();

        if (keyword.Value == "stream")
        {
            // The object read so far is the stream dictionary; the data begins on the next line.
            p.SkipToEOL();
            long dataStart = p.Stream.Position;
            body = new CorePDFStream(this, p.Stream, dataStart, ((CorePDFDict)body));

            keyword = (PDFName)p.ReadNextObject();
            if (keyword.Value != "endstream")
            {
                throw new PDFSyntaxException("XREF points to incorrect object. \'endstream\' token missing");
            }
            keyword = (PDFName)p.ReadNextObject();
        }

        // Decryption runs before the closing keyword is validated.
        if (base.IsEncrypted)
        {
            this.DecryptObject(body, id, gene);
        }

        if (keyword.Value != "endobj")
        {
            throw new PDFSyntaxException("XREF points to incorrect object. \'endobj\' or \'stream\' token missing");
        }
        return body;
    }
    catch (InvalidCastException)
    {
        // A token of the wrong type means the offset did not point at a valid object.
        throw new PDFSyntaxException("XREF points to incorrect object");
    }
}
/// <summary>
/// Adds <paramref name="pFileEntry"/> to the search results when the PDF it points to
/// contains <c>mTextToSearch</c>.
/// </summary>
/// <param name="pFileEntry">File entry whose <c>FilePath</c> is searched.</param>
private void HanldPDFFileEntry(FileEntry pFileEntry)
{
    var searcher = new PDFParser(this);
    bool containsText = searcher.IsFileContainString(pFileEntry.FilePath, mTextToSearch);
    if (!containsText)
    {
        return;
    }

    PutFileEntryToSearchResult(pFileEntry);
}
/// <summary>
/// Console entry point: reads <c>bruno.pdf</c>, parses it into calendar events, keeps the
/// events whose title is <c>EventType.Standard</c>, serialises them as an iCalendar and
/// writes the bytes to <c>output.txt</c>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var pdfReader = new PDFParser();
    var calendarWriter = new iCal();
    var scheduleParser = new Parser();

    var scheduleText = pdfReader.GetString("bruno.pdf");
    CalendarEvent[] standardEvents = scheduleParser.Parse(scheduleText)
        .Where(i => i.Title == EventType.Standard)
        .ToArray();

    var calendar = calendarWriter.CreateICalendar(standardEvents);
    byte[] serialized = calendarWriter.ICalSerializeToBytes(calendar, "hej");
    File.WriteAllBytes("output.txt", serialized);
}
/// <summary>
/// Reads the text of the PDF at <paramref name="path"/>, stores it in the <c>Desc2</c>
/// column of the document with the given id, triggers indexing and saves the changes.
/// </summary>
/// <param name="id">Id of the document record to update.</param>
/// <param name="path">Path handed to <c>PDFParser.ReadPdfFile</c>.</param>
/// <exception cref="InvalidOperationException">No document with <paramref name="id"/> exists.</exception>
/// <remarks>
/// Despite the "Async" suffix the method blocks until the background task has finished.
/// Failures inside the task surface as <see cref="AggregateException"/>.
/// </remarks>
private void InsertPdfTextToDbAsync(int id, string path)
{
    var record = db.Documents.FirstOrDefault(rec => rec.id == id);
    if (record == null)
    {
        // Fail before the expensive PDF read, with a message that names the cause,
        // instead of a NullReferenceException afterwards.
        throw new InvalidOperationException("No document record found with id " + id + ".");
    }

    PDFParser pDF = new PDFParser(_appEnvironment);
    Task<string> task = Task<string>.Factory.StartNew(pDF.ReadPdfFile, path);
    task.Wait();

    record.Desc2 = task.Result;
    AddToIndexDocAsync(id);
    db.SaveChanges();
}
/// <summary>
/// Builds the file data for a PDF document from freshly supplied bytes.
/// </summary>
/// <param name="sourceDocument">Existing document whose <c>Data</c> is reused when present; may be null.</param>
/// <param name="data">New PDF content, or null to keep the existing data untouched.</param>
/// <returns>
/// When <paramref name="data"/> is null: the existing data of <paramref name="sourceDocument"/>
/// (null when there is no source document). Otherwise the existing or a new
/// <c>PDFFileData</c> with <c>Content</c> and <c>ExtractedText</c> refreshed from <paramref name="data"/>.
/// </returns>
private static PDFFileData GetData(DbPDFDocument sourceDocument, byte[] data)
{
    if (data == null)
    {
        // Null-safe like the branch below; a null source document used to throw here.
        return sourceDocument?.Data;
    }

    var fileData = sourceDocument?.Data ?? new PDFFileData();
    fileData.Content = data;
    fileData.ExtractedText = PDFParser.ExtractTextFromPdf(data);
    return fileData;
}
// PDF conversion
/// <summary>
/// Starts a Chrome driver, downloads whatever the driver's current URL points to, parses it
/// as a PDF and writes the extracted text to a fixed file on the desktop.
/// </summary>
/// <remarks>
/// NOTE(review): no navigation happens before <c>driver.Url</c> is read, so the URL is
/// whatever a freshly started browser reports — confirm this is intended.
/// </remarks>
public void ReadPDF()
{
    // Dispose the driver so the browser process does not outlive the call.
    using (IWebDriver driver = new ChromeDriver())
    {
        URL TestURL = new URL(driver.Url);
        BufferedInputStream TestFile = new BufferedInputStream(TestURL.openStream());
        PDFParser TestPDF = new PDFParser(TestFile);
        TestPDF.parse();

        PDDocument document = TestPDF.getPDDocument();
        String TestText;
        try
        {
            TestText = new PDFTextStripper().getText(document);
        }
        finally
        {
            document.close();
        }

        System.IO.File.WriteAllText(@"C:\Users\Nicolas PC\Desktop\teste\PDFTESTE.txt", TestText);
    }
}
/// <summary>
/// Downloads a PDF from <paramref name="url"/> into the configured <c>downloadLocation</c>
/// folder and returns its text as produced by <c>ConvertToText</c>.
/// </summary>
/// <param name="url">Address of the PDF to download.</param>
/// <returns>The converted text, or an empty string when any step failed (the error is logged).</returns>
/// <remarks>
/// The local file name is built from the current hour, minute, second and millisecond.
/// <c>PDFParser.ExtractText</c> additionally writes a .txt file with the same base name;
/// that file is not read back here.
/// </remarks>
public static string DownloadFile(string url)
{
    string fileLocation = ConfigurationManager.AppSettings["downloadLocation"];

    string hourPart = DateTime.Now.Hour.ToString();
    string minutePart = DateTime.Now.Minute.ToString();
    string secondPart = DateTime.Now.Second.ToString();
    string millisecondPart = DateTime.Now.Millisecond.ToString();
    string fileName = hourPart + minutePart + secondPart + millisecondPart;

    string fullFileName = fileLocation + @"\" + fileName + ".pdf";
    string result = "";

    try
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(url, fullFileName);
        }

        PDFParser pdfParser = new PDFParser();
        string textFileName = System.IO.Path.GetFileNameWithoutExtension(fullFileName) + ".txt";
        pdfParser.ExtractText(fullFileName, textFileName);

        try
        {
            byte[] pdfBytes = File.ReadAllBytes(fullFileName);
            result = ConvertToText(pdfBytes);
        }
        catch (Exception ex)
        {
            ErrorUtil.logError(ex, "");
        }
    }
    catch (Exception ex)
    {
        ErrorUtil.logError(ex, "");
    }

    return result;
}
/// <summary>
/// Drives Chrome through the Infocrim site (login, four navigation clicks, print-preview
/// destination selection), downloads the resulting page as a PDF and writes its text to a
/// fixed file on the desktop.
/// </summary>
/// <remarks>
/// NOTE(review): the credentials and the output path are hard-coded in source.
/// NOTE(review): a <c>ChromeOptions</c> instance with the "headless" argument used to be
/// built here but was never passed to the driver, so the browser has always run with
/// default options; that unused object was removed.
/// </remarks>
public void Infocrim()
{
    using (IWebDriver driver = new ChromeDriver())
    {
        // Login
        driver.Navigate().GoToUrl("http://ec2-18-231-116-58.sa-east-1.compute.amazonaws.com/ ");
        driver.FindElement(By.Id("username")).SendKeys("fiap");
        driver.FindElement(By.Id("password")).SendKeys("mpsp");
        driver.FindElement(By.Id("password")).SendKeys(Keys.Enter);

        // Navigate to the report and open the print preview.
        driver.Navigate().GoToUrl("http://ec2-18-231-116-58.sa-east-1.compute.amazonaws.com/infocrim/login.html");
        driver.FindElement(By.XPath("/html/body/table/tbody/tr[3]/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[2]/td[4]/a/img")).Click();
        driver.FindElement(By.XPath("/html/body/a/table[3]/tbody/tr/td[2]/table[1]/tbody/tr[3]/td/table/tbody/tr[2]/td/table/tbody/tr/td/div/a/img")).Click();
        driver.FindElement(By.XPath("/html/body/table/tbody/tr[2]/td/table[3]/tbody/tr[2]/td[2]/a")).Click();
        driver.FindElement(By.XPath("/html/body/table/tbody/tr/td/a[2]/img")).Click();
        driver.FindElement(By.XPath("/html/body/print-preview-app//print-preview-sidebar//div[2]/print-preview-destination-settings//print-preview-settings-section[1]/div/print-preview-destination-select//select")).Click();
        driver.FindElement(By.XPath("/html/body/print-preview-app//print-preview-sidebar//div[2]/print-preview-destination-settings//print-preview-settings-section[1]/div/print-preview-destination-select//select/option[2]")).Click();

        // Download the current page and parse it as a PDF.
        URL url = new URL(driver.Url);
        BufferedInputStream fileToParse = new BufferedInputStream(url.openStream());
        PDFParser parser = new PDFParser(fileToParse);
        parser.parse();

        // Only the full-document text was ever used; the extra page-1 extraction whose
        // result was discarded has been dropped.
        PDDocument document = parser.getPDDocument();
        string saida;
        try
        {
            saida = new PDFTextStripper().getText(document);
        }
        finally
        {
            document.close();
        }

        System.IO.File.WriteAllText(@"C:\Users\favar\Desktop\Texto\Infocrim.txt", saida);
    }
}
/// <summary>
/// Console entry point: asks for an existing .pdf file and a non-zero "from" and "to" page
/// number, then runs the <c>PDFParser</c> Excel pipeline
/// (<c>A_SaveExcel</c>, <c>B_LoadExcels</c>, <c>MainIterator</c>, <c>End</c>).
/// </summary>
/// <remarks>
/// Each prompt repeats until the answer is acceptable. If standard input is closed
/// (<c>ReadLine</c> returns null) the program exits instead of crashing or looping forever.
/// </remarks>
static void Main()
{
    string FileLoc;
    do
    {
        Console.Write("Enter File Location : ");
        FileLoc = Console.ReadLine();
        if (FileLoc == null)
        {
            return;
        }
        // The path is kept in its original casing (lower-casing it breaks lookups on
        // case-sensitive file systems); only the extension test ignores case.
    } while (!File.Exists(FileLoc) || !FileLoc.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase));

    int from;
    do
    {
        Console.Write("From Page : ");
        string answer = Console.ReadLine();
        if (answer == null)
        {
            return;
        }
        // On a failed parse TryParse leaves 0 in 'from', which repeats the prompt.
        int.TryParse(answer, out from);
    } while (from == default);

    int to;
    do
    {
        Console.Write("To Page : ");
        string answer = Console.ReadLine();
        if (answer == null)
        {
            return;
        }
        int.TryParse(answer, out to);
    } while (to == default);

    PDFParser p = new PDFParser(FileLoc, from, to);
    p.A_SaveExcel();
    p.B_LoadExcels();
    // The instance is not used afterwards; presumably the constructor performs the
    // iteration work — confirm against MainIterator.
    MainIterator iter = new MainIterator(p.Table);
    p.End();
}
/// <summary>
/// Extracts the text of every page of a PDF file using PDFBox.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
/// <remarks>
/// The input stream and both document objects are closed even when parsing or text
/// extraction throws; previously they were only closed on the success path and the
/// stream was never closed explicitly.
/// </remarks>
public string Parse(string fileName)
{
    // java.io is used because PDFBox is ported from Java.
    var pdfFile = new FileInputStream(fileName);
    try
    {
        var pdfParser = new PDFParser(pdfFile);

        // Parse first so the COSDocument becomes available.
        pdfParser.parse();

        // COSDocument is the in-memory representation of the PDF, see
        // https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html
        var cosDocument = pdfParser.getDocument();
        try
        {
            var pdDocument = new PDDocument(cosDocument);
            try
            {
                var pdfTextStripper = new PDFTextStripper();

                // To strip only specific pages, call setStartPage / setEndPage here.
                return pdfTextStripper.getText(pdDocument);
            }
            finally
            {
                // Closes all storage and deletes the temp files.
                pdDocument.close();
            }
        }
        finally
        {
            cosDocument.close();
        }
    }
    finally
    {
        pdfFile.close();
    }
}
/// <summary>
/// On an MTA thread, shows the contents of <paramref name="fileToConvert"/> in
/// <c>textBox</c>. PDF files are first converted to text through a temporary file, which
/// is deleted afterwards; other files are shown as-is. Missing files are ignored.
/// </summary>
/// <param name="fileToConvert">Path of the file to display.</param>
public void convertFile(string fileToConvert)
{
    O2Thread.mtaThread(
        () =>
        {
            if (!fileToConvert.fileExists())
            {
                return;
            }

            if (!fileToConvert.extension(".pdf"))
            {
                textBox.set_Text(fileToConvert.contents());
                return;
            }

            textBox.set_Text("...processing pdf file: " + fileToConvert);
            var extractor = new PDFParser();
            var scratchFile = PublicDI.config.getTempFileInTempDirectory(".txt");
            extractor.ExtractText(fileToConvert, scratchFile);
            textBox.set_Text(scratchFile.contents().fixCRLF());
            Files.deleteFile(scratchFile);
        });
}
/// <summary>
/// For every PDF in <c>inDir</c>: extracts its text, splits the first extracted entry into
/// lines, trims one leading character (and one trailing space) from each line longer than
/// one character, prints the numbered lines to the console and writes the same listing to
/// <c>outDir/&lt;pdf name&gt;.txt</c>.
/// </summary>
/// <remarks>The fields <c>allData</c> and <c>data</c> are overwritten for each file.</remarks>
public void PrintDebug()
{
    string[] pdfFiles = Directory.GetFiles(inDir, "*.pdf");

    foreach (string pdfFile in pdfFiles)
    {
        PDFParser pdfParser = new PDFParser();
        allData = pdfParser.ExtractTextArray(pdfFile);
        data = allData[0].Split('\n');

        // Clean-up pass: drop the first character and, if present, one trailing space.
        for (int line = 0; line < data.Length; line++)
        {
            if (data[line].Length <= 1)
            {
                continue;
            }

            data[line] = data[line].Remove(0, 1);
            int lastIndex = data[line].Length - 1;
            if (data[line][lastIndex] == ' ')
            {
                data[line] = data[line].Remove(lastIndex, 1);
            }
        }

        // Listing pass: one "data: <n> | <text>" row per line.
        string listing = "";
        for (int line = 0; line < data.Length; line++)
        {
            data[line] = data[line].Replace('\n', ' ');
            string row = "data: " + line + " | " + data[line];
            listing = listing + row + "\n";
            System.Console.WriteLine(row);
        }

        string outputPath = Path.Combine(outDir, Path.GetFileNameWithoutExtension(pdfFile) + ".txt");
        File.WriteAllText(outputPath, listing);
    }
}
/// <summary>
/// Console entry point that runs the same kind of text extraction through several PDF
/// libraries and writes each result to its own text file, for side-by-side comparison.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
/// <remarks>
/// Results recorded by the author for the sample files
/// (letter.pdf / employe-1.pdf / feuille_de_paie.pdf / modele-bulletin-de-salaire.pdf):
/// <list type="bullet">
/// <item>iTextSharp PdfReader: "Index was outside the bounds of the array." / ok /
/// "Rebuild failed: trailer not found.; Original message: PDF startxref not found." / ok</item>
/// <item>PdfSharp PdfReader.Open: ok / ok / "Non-negative number required." / ok</item>
/// <item>PDFParser (iTextSharp based): error / ok / error / error</item>
/// <item>PdfTextExtractor (PdfSharp based): ok / ok / "Non-negative number required." / ok</item>
/// </list>
/// All writers are wrapped in <c>using</c> so the files are closed even when an
/// extraction step throws.
/// </remarks>
static void Main(string[] args)
{
    ITextParse.ExtractText("employe-1.pdf", "iparse.txt");

    // iTextSharp, page by page.
    using (iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader("modele-bulletin-de-salaire.pdf"))
    {
        StringBuilder text = new StringBuilder();
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            string currentText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy);

            // Round-trip through the default code page into UTF-8.
            currentText = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));
            text.Append(currentText);
        }

        using (System.IO.StreamWriter file = new System.IO.StreamWriter("itextsharp.txt"))
        {
            file.WriteLine(text);
        }
    }

    // PdfSharp, page by page.
    using (var _document = PdfReader.Open("letter.pdf", PdfDocumentOpenMode.ReadOnly))
    using (System.IO.StreamWriter file = new System.IO.StreamWriter("pdfsharp.txt"))
    {
        foreach (PdfPage page in _document.Pages)
        {
            var text = ExtractText(page);
            foreach (string s in text)
            {
                file.Write(s);
            }
        }
    }

    // PDFParser (iTextSharp based).
    PDFParser parser = new PDFParser();
    parser.ExtractText("employe-1.pdf", "pdfparser.txt");

    // PdfTextExtractor (PdfSharp based).
    string text2 = PdfTextExtractor.GetText("modele-bulletin-de-salaire.pdf");
    using (System.IO.StreamWriter file2 = new System.IO.StreamWriter("PdfTextExtractor.txt"))
    {
        file2.Write(text2);
    }
}
/// <summary>
/// Click handler: fills the overview repeaters — members exempt from the fee, members with
/// an invoice they have not downloaded, members without an invoice, members with a
/// discount, and members who already downloaded their giro card.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void showNoDownload_Click(object sender, EventArgs e)
{
    PDFParser parser = new PDFParser();
    var fullList = Medlem.GetMedlemmer();

    // Materialise once: this query calls HasGiroKortBeenDownloaded per member and is the
    // source of four of the data sources below. Left deferred, it was re-evaluated for
    // each of them.
    var list = fullList
        .Where(x => !parser.HasGiroKortBeenDownloaded(x.MemberId))
        .OrderBy(x => x.Årgang)
        .ThenBy(x => x.Navn)
        .ToList();

    rptFritaget.DataSource = list.Where(x => x.Kontingentfritagelse).ToList();
    rptNoDownload.DataSource = list.Where(x => !x.Kontingentfritagelse && PDFParser.InvoiceExists(x.MemberId)).ToList();
    rptNoInvoice.DataSource = list.Where(x => !x.Kontingentfritagelse && !PDFParser.InvoiceExists(x.MemberId)).ToList();
    rptDiscount.DataSource = fullList.Where(x => !String.IsNullOrEmpty(x.Rabat)).ToList();
    rptHasDownloaded.DataSource = fullList.Except(list).ToList();
    DataBind();
}
/// <summary>
/// Click handler: after checking the security word, walks the selected members in
/// year-group order and sends each of them a mail via <c>SendMail</c>, writing progress to
/// the response as it goes.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
/// <remarks>
/// When <c>txtMedlemsnummer</c> is empty all members are processed, otherwise only the
/// matching one. In fee-mail mode (<c>kontingentMails</c>) members without an invoice are
/// reported and skipped; in the other mode members who do not allow e-mail are skipped.
/// When <c>txtTestMail</c> is filled in, processing stops after the first mail is sent.
/// NOTE(review): the security word is hard-coded in source.
/// </remarks>
protected void send_Click(object sender, EventArgs e)
{
    if (txtSecurity.Text != "prodknt")
    {
        Response.Write("Du skal skrive det hemmelige kodeord i kodeord!!");
        return;
    }
    txtSecurity.Text = "";

    PDFParser parser = new PDFParser();
    String currentYearGroup = "";
    var recipients = Medlem.GetMedlemmer()
        .Where(x => txtMedlemsnummer.Text == "" || txtMedlemsnummer.Text == x.MemberId)
        .OrderBy(x => x.Årgang);

    foreach (var medlem in recipients)
    {
        // Print a header whenever a new year group starts.
        if (currentYearGroup != medlem.Årgang)
        {
            currentYearGroup = medlem.Årgang;
            Response.Write("************ Årgang: " + currentYearGroup + " **********<br/>");
        }

        if (!PDFParser.InvoiceExists(medlem.MemberId) && kontingentMails)
        {
            Response.Write("<b>Intet girokort fundet for " + medlem.Navn + ", " + medlem.Årgang + ", " + medlem.MemberId + "</b><br/>");
            continue;
        }

        if (kontingentMails)
        {
            // If the giro has already been downloaded, do not send.
            if (chkOnlySendToMembersWhoHaveNotDownloadedGiro.Checked && parser.HasGiroKortBeenDownloaded(medlem.MemberId))
            {
                continue;
            }
            Response.Write(medlem.Årgang + " - " + Request.RawUrl.Replace("SendKontingentMails", "Kontingent") + "?memberId=" + medlem.MemberId + "<br/>");
        }
        else
        {
            Response.Write(medlem.MemberId + ": " + medlem.Navn + ", " + medlem.Email);
            if (!medlem.AllowEmail)
            {
                Response.Write(" - VIL IKKE MODTAGE MAILS FRA KLUBBEN<br/>");
                continue;
            }
            Response.Write("<br/>");
        }

        Response.Flush();
        try
        {
            SendMail(medlem);
            if (txtTestMail.Text != "")
            {
                return;
            }
        }
        catch (Exception ex)
        {
            Response.Write("<b>FEJL: " + ex.ToString() + "</b>");
        }
    }
}
/// <summary>
/// Resolves object number <paramref name="id"/> through the cross-reference table and
/// reads the object it points to.
/// </summary>
/// <param name="id">Object number to look up in <c>mXRef</c>.</param>
/// <returns>
/// The object, or <c>CorePDFNull.Instance</c> when the table is invalid, the id is outside
/// the table, the entry has type 0, a type-1 entry has offset -1, a type-2 entry does not
/// refer to a stream, or the object stream does not list the id.
/// </returns>
/// <exception cref="PDFException">
/// The entry type is not 0, 1 or 2, or the object stream is malformed.
/// </exception>
internal CorePDFDirect ReadDirect(int id)
{
    if (!this.mXRefValid)
    {
        return CorePDFNull.Instance;
    }

    // Valid indices are 0..Count-1. The previous "id > Count" test let id == Count
    // through to the indexer below.
    // NOTE(review): assumes mXRef is a zero-based list indexed by object number — confirm.
    if (id < 0 || id >= this.mXRef.Count)
    {
        return CorePDFNull.Instance;
    }

    XRefEntry entry = ((XRefEntry)this.mXRef[id]);
    switch (entry.type)
    {
        case 0:
        {
            return CorePDFNull.Instance;
        }

        case 1:
        {
            // The object is stored directly in the file at entry.offset.
            if (entry.offset == ((long)-1))
            {
                return CorePDFNull.Instance;
            }

            long savedPosition = this.mParser.Stream.Position;
            try
            {
                return this.ReadDirectFromStream(this.mParser, entry.offset, id, entry.generation);
            }
            finally
            {
                // Put the shared parser back where it was, also when reading fails.
                this.mParser.Stream.Position = savedPosition;
            }
        }

        case 2:
        {
            // The object is stored inside an object stream; entry.offset holds the index
            // of that stream in Indirects.
            PDFStream objectStream = (this.Indirects[((int)entry.offset)].Direct as PDFStream);
            if (objectStream == null)
            {
                return CorePDFNull.Instance;
            }

            PDFName typeName = (objectStream.Dict["Type"] as PDFName);
            if (typeName == null)
            {
                throw new PDFException("Invalid Object Stream");
            }
            if (typeName.Value != "ObjStm")
            {
                throw new PDFException("Invalid object stream");
            }

            PDFInteger first = (objectStream.Dict["First"] as PDFInteger);
            if (first == null)
            {
                throw new PDFException("Invalid Object stream");
            }
            long firstOffset = first.Value;

            PDFInteger count = (objectStream.Dict["N"] as PDFInteger);
            if (count == null)
            {
                throw new PDFException("Invalid Object stream");
            }

            Stream decoded = objectStream.Decode();
            PDFParser objectParser = new PDFParser(decoded, this);

            // The stream starts with N pairs "<object number> <relative offset>".
            for (int pair = 0; pair < count.Int32Value; pair++)
            {
                PDFInteger number = (objectParser.ReadNextObject() as PDFInteger);
                if (number == null)
                {
                    throw new PDFException("Invalid object stream");
                }

                if (number.Value != ((long)id))
                {
                    // Not ours: skip the offset that belongs to this pair.
                    objectParser.ReadNextObject();
                    continue;
                }

                PDFInteger relativeOffset = (objectParser.ReadNextObject() as PDFInteger);
                if (relativeOffset == null)
                {
                    throw new PDFException("Invalid object stream");
                }

                decoded.Position = firstOffset + relativeOffset.Value;
                return ((CorePDFDirect)objectParser.ReadNextObject());
            }

            return CorePDFNull.Instance;
        }

        default:
        {
            throw new PDFException("Invalid object type");
        }
    }
}
/// <summary>
/// Seeds an empty database from the PDF files found in the sub-folders of
/// <c>wwwroot\pdf</c>: two semesters, the groups of each PDF, and one student plus one
/// rating (for the first semester) per student row.
/// </summary>
/// <param name="context">Database context to fill. Nothing happens when it already contains students.</param>
public static void FillDbFromPDF(MarksContext context)
{
    if (context.Students.Any())
    {
        return;
    }

    var autumn = new Semester { Season = "Осінь", Year = 2016 };
    var spring = new Semester { Season = "Весна", Year = 2017 };
    context.Semesters.AddRange(autumn, spring);
    context.SaveChanges();

    var students = new List<Student>();
    var ratings = new List<Rating>();

    foreach (string directory in Directory.GetDirectories("wwwroot\\pdf"))
    {
        foreach (string pdfPath in Directory.GetFiles(directory))
        {
            PDFParser parser = new PDFParser(pdfPath);
            List<Group> parsedGroups = parser.Groups;
            List<StudentData> parsedStudents = parser.Students;

            // Groups are saved right away so the lookup below can find them.
            context.Groups.AddRange(parsedGroups);
            context.SaveChanges();

            foreach (StudentData row in parsedStudents)
            {
                string groupName = row.Group;
                var student = new Student
                {
                    Name = row.Name,
                    Group = context.Groups.First(p => p.Name == groupName),
                    // Everyone whose info is not "контракт" counts as budgetary.
                    IsBudgetary = row.Info != "контракт"
                };
                students.Add(student);

                ratings.Add(new Rating
                {
                    Student = student,
                    Semester = autumn,
                    Value = row.Rating,
                    Bonus = 0,
                    Note = row.Info
                });
            }
        }
    }

    context.Students.AddRange(students);
    context.Ratings.AddRange(ratings);
    context.SaveChanges();
}
/// <summary>
/// For every changed "Attachment" field of <paramref name="record"/>, extracts the plain
/// text of the attached .doc/.docx/.pdf/.rtf file and stores it in the companion
/// "&lt;field&gt;RawText" column, creating that column when it does not exist. When the
/// attachment is no longer a document, an existing RawText column is set to null.
/// </summary>
/// <param name="record">Record whose attachment fields are inspected.</param>
public static void CheckAttachmentsForDocOrPDFText(ActiveRecord record)
{
    // Walk the field list for this record looking for attachments.
    foreach (var fieldName in record.GetFieldNames())
    {
        bool isAttachmentField = fieldName.Contains("Attachment") && fieldName.DoesntContain("RawText");
        if (!isAttachmentField)
        {
            continue;
        }

        if (!ActiveFieldBase.IsDirtyObj(record[fieldName].ValueObject, record[fieldName].OriginalValueObject))
        {
            continue;
        }

        string attachment = record[fieldName].ToString();
        bool looksLikeDocument = attachment.Contains(".doc") || attachment.EndsWith(".pdf") || attachment.EndsWith(".rtf");

        if (!looksLikeDocument)
        {
            // No document any more: clear previously extracted text.
            if (record.FieldExists(fieldName + "RawText"))
            {
                (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=null where ", record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
            }
            continue;
        }

        if (!record.FieldExists(fieldName + "RawText"))
        {
            (new Sql("ALTER TABLE ", record.GetTableName().SqlizeName(), " ADD [" + fieldName + "RawText] nvarchar (MAX);")).Execute();
        }

        string output = "";
        string lowerCaseName = attachment.ToLower();

        if (lowerCaseName.EndsWith(".doc"))
        {
            OfficeFileReader.OfficeFileReader officeReader = new OfficeFileReader.OfficeFileReader();
            // The returned count is not needed; the text arrives through 'output'.
            officeReader.GetText(Web.MapPath(Web.Attachments) + attachment, ref output);
        }
        else if (lowerCaseName.EndsWith(".docx"))
        {
            BewebCore.ThirdParty.ReadWordDocText.DocxToText docxReader = new DocxToText(Web.MapPath(Web.Attachments) + attachment);
            output = docxReader.ExtractText();
        }
        else if (attachment.Contains(".pdf"))
        {
            PdfToText.PDFParser pdf = new PDFParser();
            pdf.ExtractText(Web.MapPath(Web.Attachments) + attachment, ref output);
        }
        else if (attachment.Contains(".rtf"))
        {
#if RTFProcessingAvailable
            // Create the RTF tree object, then load and parse the RTF document.
            RtfTree tree = new RtfTree();
            tree.LoadRtfFile(Web.MapPath(Web.Attachments) + attachment);
            output = tree.Text;
#else
            throw new Exception("rtf library not included");
#endif
        }

        if (output.Trim() != "")
        {
            (new Sql("update ", record.GetTableName().SqlizeName(), "set " + fieldName + "RawText=", output.SqlizeText(), " where ", record.GetPrimaryKeyName().SqlizeName(), "=", record.ID_Field.Sqlize(), "")).Execute();
        }
    }
}
/// <summary>
/// Drives Chrome through the Detran site for the given person, collects the driver-licence
/// PDF, the parents' names from the licence-image page and the vehicle PDF, maps the
/// combined text onto a <c>DetranModel</c>, stores it and returns it as indented JSON.
/// </summary>
/// <param name="pesquisaCPFCNPJ">Search data; <c>CPFCNPJ</c> and <c>Nome</c> are typed into the site's forms.</param>
/// <returns>The stored <c>DetranModel</c> serialised as indented JSON.</returns>
/// <remarks>
/// The fields are picked by fixed position after replacing line breaks with ':' and
/// splitting on ':'.
/// NOTE(review): many fields are isolated by removing literal sample values (for example
/// "gge4223 " or "AV LINS DE VASCONCELOS "), so the mapping presumably only works for the
/// record those literals came from — confirm before using with other data.
/// </remarks>
public string Detran(PesquisaCPFCNPJ pesquisaCPFCNPJ)
{
    var options = new ChromeOptions();
    options.AddArguments("no-sandbox");

    using (IWebDriver driver = new ChromeDriver("C:/inetpub/wwwroot/wwwroot", options))
    {
        // Driver-licence search; the result opens in a second window.
        driver.Navigate().GoToUrl("http://ec2-18-231-116-58.sa-east-1.compute.amazonaws.com/detran/login.html");
        driver.FindElement(By.Id("form:j_id563205015_44efc15b")).Click();
        driver.FindElement(By.Id("navigation_a_M_16")).Click();
        driver.FindElement(By.XPath("//*[@id='navigation_a_F_16']")).Click();
        driver.FindElement(By.Id("form:rg")).SendKeys(pesquisaCPFCNPJ.CPFCNPJ.ToString());
        driver.FindElement(By.Id("form:nome")).SendKeys(pesquisaCPFCNPJ.Nome);
        driver.FindElement(By.LinkText("Pesquisar")).Click();
        driver.SwitchTo().Window(driver.WindowHandles[1]);
        string saida = ExtractPdfTextFromUrl(driver.Url);

        // Licence image page (third window): parents' names.
        driver.SwitchTo().Window(driver.WindowHandles[0]);
        driver.FindElement(By.Id("navigation_a_M_16")).Click();
        driver.FindElement(By.PartialLinkText("Consultar Imagem da CNH")).Click();
        driver.FindElement(By.LinkText("Pesquisar")).Click();
        driver.SwitchTo().Window(driver.WindowHandles[2]);
        string nPai = driver.FindElement(By.XPath("/html/body/div[4]/div/table/tbody/tr/td/div/div/form/div[3]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td/span")).Text;
        string nMae = driver.FindElement(By.XPath("/html/body/div[4]/div/table/tbody/tr/td/div/div/form/div[3]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td[2]/table/tbody/tr[4]/td/table/tbody/tr[2]/td/span")).Text;

        // Vehicle search (fourth window).
        driver.SwitchTo().Window(driver.WindowHandles[0]);
        driver.FindElement(By.Id("navigation_a_M_18")).Click();
        driver.FindElement(By.PartialLinkText("Consultar Veículo Base Estadual")).Click();
        driver.FindElement(By.XPath("/html/body/div[4]/div/table/tbody/tr/td/div/div/form/div[1]/div[2]/table[2]/tbody/tr[2]/td[2]/input")).SendKeys(pesquisaCPFCNPJ.CPFCNPJ.ToString());
        driver.FindElement(By.LinkText("Pesquisar")).Click();
        driver.SwitchTo().Window(driver.WindowHandles[3]);
        string saidaCarro = ExtractPdfTextFromUrl(driver.Url);

        string resultado = saida + nPai + nMae + saidaCarro;
        string[] strsplit = resultado.Replace("\r\n", ":").Split(':');

        string cpf = strsplit[33].Trim();

        DetranModel objDen = new DetranModel
        {
            CNPJCPF = long.Parse(cpf.Replace(".", "").Replace("-", "")),
            RG = strsplit[13].Trim(),
            Expeditor = strsplit[34].Trim(),
            Registro = strsplit[36].Trim(),
            Local = strsplit[38].Trim(),
            PID = strsplit[40].Trim(),
            EmissaoCnh = strsplit[42].Trim(),
            Categoria = strsplit[46].Trim(),
            PrimeiraHabilitação = strsplit[48].Trim(),
            StatusCnh = strsplit[50].Trim(),
            Renach = strsplit[52].Trim(),
            EspelhoCnh = strsplit[54].Trim(),
            ValidadeCnh = strsplit[56].Trim(),
            Pontuacao = strsplit[58].Trim(),
            // Parents' names come from the licence-image page, not from the split text.
            NomePai = nPai,
            NomeMae = nMae,
            Placa = strsplit[144].Replace(" 7107 - SAO PAULO", "").Trim(),
            MunicipioCarro = strsplit[144].Replace("gge4223 ", "").Trim(),
            Renavam = strsplit[146].Replace(" 9AAAAVAU0J4001600 ", "").Trim(),
            Chassi = strsplit[146].Replace("01172566666 ", "").Trim(),
            NumMotor = strsplit[148].Replace(" 22/11/18 00", "").Trim(),
            DataAltMotor = strsplit[148].Replace("CWL031481 ", "").Trim(),
            Tipo = strsplit[151].Replace(" 1 - IMPORTADO 16 - ALCO/GASOL", "").Trim(),
            Procedencia = strsplit[151].Replace("6 - AUTOMOVEL ", "").Replace(" 16 - ALCO/GASOL ", "").Trim(),
            Combustivel = strsplit[151].Replace("6 - AUTOMOVEL 1 - IMPORTADO ", "").Trim(),
            Cor = strsplit[153].Replace(" 162801 – VARIANT GL ", "").Trim(),
            MarcaModelo = strsplit[153].Replace("4 - BRANCA 162801 – ", "").Trim(),
            CategoriaAut = strsplit[155].Replace(" 1971 1972 ", "").Trim(),
            Fabricacao = strsplit[155].Replace("1 - PARTICULAR ", "").Replace(" 1972 ", "").Trim(),
            Modelo = strsplit[155].Replace("1 - PARTICULAR 1971 ", "").Trim(),
            Logradouro = strsplit[166].Replace(" 00121 ", "").Trim(),
            Numero = strsplit[166].Replace("AV LINS DE VASCONCELOS ", "").Trim(),
            Complemento = strsplit[182].Replace(" 010006-010 ", "").Trim(),
            CEP = strsplit[182].Replace("4 ANDAR ", "").Trim(),
            Bairro = strsplit[184].Replace(" 7107 - SAO PAULO SP ", "").Trim(),
            Licenciamento = strsplit[225].Replace(" 07/03/2019 ", "").Trim(),
            DataLicenciamento = strsplit[225].Replace("2019 ", "").Trim(),
            DataEmissaoCRV = strsplit[227].Trim()
        };

        detranRepository.Insert(objDen);
        string objjsonData = JsonConvert.SerializeObject(objDen, new JsonSerializerSettings { Formatting = Formatting.Indented });
        return (objjsonData);
    }
}

/// <summary>
/// Downloads the PDF at <paramref name="pdfUrl"/> and returns the text of the whole
/// document, closing the document afterwards.
/// </summary>
/// <remarks>
/// Replaces two copy-pasted extraction sequences. The second copy configured the first
/// copy's stripper by mistake, and both computed a page-1 text that was never used.
/// </remarks>
private static string ExtractPdfTextFromUrl(string pdfUrl)
{
    URL url = new URL(pdfUrl);
    BufferedInputStream fileToParse = new BufferedInputStream(url.openStream());
    PDFParser parser = new PDFParser(fileToParse);
    parser.parse();

    PDDocument document = parser.getPDDocument();
    try
    {
        return new PDFTextStripper().getText(document);
    }
    finally
    {
        document.close();
    }
}
/// <summary>
/// Builds one quoted, comma-separated row for every PDF found in <c>inDir</c> and writes all
/// rows to <c>output.txt</c> in <c>outDir</c>.
/// </summary>
/// <remarks>
/// Each row holds date, name (prefixed with "C"), action ("BUY"/"SELL" shortened to "B"/"S"),
/// ticker, amount (commas removed) and price. Every value is read from a line located relative
/// to marker text through <c>GetLine</c>. A file whose layout is not recognised, that is too
/// short to carry the header lines, or whose computed line numbers fall outside the extracted
/// text contributes an empty row instead of aborting the whole run with an
/// <c>IndexOutOfRangeException</c> before the output file is written.
/// </remarks>
public void DoWork()
{
    string[] allFiles = Directory.GetFiles(inDir, "*.pdf");
    List<string> finalData = new List<string>();

    foreach (string pdfFile in allFiles)
    {
        PDFParser pdfParser = new PDFParser();
        allData = pdfParser.ExtractTextArray(pdfFile);
        data = allData[0].Split('\n');

        // Lines longer than one character lose their first character and one trailing space.
        for (int j = 0; j < data.Length; j++)
        {
            if (data[j].Length > 1)
            {
                data[j] = data[j].Remove(0, 1);
                if (data[j][data[j].Length - 1] == ' ')
                {
                    data[j] = data[j].Remove(data[j].Length - 1, 1);
                }
            }
        }

        // Detect the layout from marker text on header lines 1, 3 and 5. The checks run in
        // this order on purpose: a later match overrides an earlier one.
        int formatType = 0;
        if (HeaderLineContains(1, "SECURITY") && HeaderLineContains(3, "COMPANY"))
        {
            formatType = 1;
        }
        if (HeaderLineContains(1, "Locked Bag") && HeaderLineContains(5, "Square"))
        {
            formatType = GetLine("ORDINARY FULLY PAID") >= 0 ? 2 : 3;
        }
        if (HeaderLineContains(3, "SECURITY") && HeaderLineContains(5, "COMPANY"))
        {
            formatType = 1;
        }

        int lnDate = -1;
        int lnName = -1;
        int lnAction = -1;
        int lnTicker = -1;
        int lnAmount = -1;
        int lnPrice = -1;

        if (formatType == 1)
        {
            lnDate = LineAfterMarker(GetLine("TOTAL COST"), 2);
            lnName = LineAfterMarker(lnDate, 4);
            lnAction = LineAfterMarker(GetLine("TAX INVOICE"), 2);
            lnTicker = LineAfterMarker(GetLine("WE HAVE BOUGHT THE FOLLOWING SECURITIES FOR YOU"), 10);
            lnAmount = LineAfterMarker(lnDate, 20);
            lnPrice = LineAfterMarker(GetLine("PAYMENT METHOD - DIRECT DEBIT OF CLEARED"), -2);
        }
        else if (formatType == 2 || formatType == 3)
        {
            // Format 3 is an alternate of format 2; only the ticker is located differently.
            int confirmationLine = GetLine("CONFIRMATION NO");
            lnDate = LineAfterMarker(GetLine("CONSIDERATION (AUD)"), 2);
            lnName = LineAfterMarker(confirmationLine, 2);
            lnAction = LineAfterMarker(GetLine("TAX INVOICE"), 2);
            lnTicker = formatType == 2
                ? LineAfterMarker(GetLine("ORDINARY FULLY PAID"), 4)
                : LineAfterMarker(GetLine("ORDER COMPLETED"), 8);
            lnAmount = LineAfterMarker(confirmationLine, 6);
            lnPrice = LineAfterMarker(GetLine("AVERAGE PRICE"), 2);
        }

        string fData = "";
        if (formatType != 0 && DataLinesExist(lnDate, lnName, lnAction, lnTicker, lnAmount, lnPrice))
        {
            // The normalised values are written back into data, so PrintDebug sees them too.
            data[lnName] = "C" + data[lnName];
            if (data[lnAction] == "BUY")
            {
                data[lnAction] = "B";
            }
            else if (data[lnAction] == "SELL")
            {
                data[lnAction] = "S";
            }
            data[lnAmount] = data[lnAmount].Replace(",", "");

            string[] fields =
            {
                data[lnDate], data[lnName], data[lnAction], data[lnTicker], data[lnAmount], data[lnPrice]
            };
            fData = "\"" + string.Join("\",\"", fields) + "\"";
        }

        // Unusable files still add an (empty) row, keeping one output line per input file.
        finalData.Add(fData);
    }

    if (debugMode)
    {
        PrintDebug();
    }

    string oFile = Path.Combine(outDir, "output.txt");
    File.WriteAllLines(oFile, finalData);
}

/// <summary>Returns true when line <paramref name="index"/> of <c>data</c> exists and contains <paramref name="text"/>.</summary>
private bool HeaderLineContains(int index, string text)
{
    return index >= 0 && index < data.Length && data[index].Contains(text);
}

/// <summary>
/// Applies <paramref name="offset"/> to a marker line number, keeping a negative number negative.
/// A negative <c>GetLine</c> result is treated as "marker not found", matching the
/// <c>&gt;= 0</c> test used for layout detection.
/// </summary>
private static int LineAfterMarker(int markerLine, int offset)
{
    return markerLine < 0 ? -1 : markerLine + offset;
}

/// <summary>Returns true when every given line number is a valid index into <c>data</c>.</summary>
private bool DataLinesExist(params int[] lineNumbers)
{
    foreach (int line in lineNumbers)
    {
        if (line < 0 || line >= data.Length)
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Reads the content of the PDF file at the given path.
/// </summary>
/// <param name="filePath">Path of the PDF file; passed unchanged to <c>PDFParser.ExtractText</c>.</param>
/// <returns>The value produced by <c>PDFParser.ExtractText</c> for <paramref name="filePath"/>.</returns>
public static string GetPdfContent(string filePath)
{
    var extractor = new PDFParser();
    string content = extractor.ExtractText(filePath);
    return content;
}
/// <summary>
/// Indexes a single file in Elasticsearch according to its kind: hypertext, plain text, Word,
/// PDF, PowerPoint, image, audio or video. An existing entry is updated, otherwise a new one
/// is inserted.
/// </summary>
/// <remarks>
/// <para>
/// The path is expected to contain <c>servidores/&lt;server&gt;/&lt;department&gt;/...</c>; the
/// server id, department and public URL are derived from that part of the file URI.
/// </para>
/// <para>
/// Hypertext files are indexed when <paramref name="tipo"/> selects either text or hypertext,
/// so requesting "hipertexto" alone now indexes them. Office documents are always closed and
/// their host application quit, even when reading fails. PDF text comes from
/// <c>TextExtractor</c> only; no side file is written. Bitmaps and media files are disposed as
/// soon as their metadata has been read.
/// </para>
/// </remarks>
/// <param name="path">Full path of the file to index.</param>
/// <param name="tipo">
/// Kind of content to index: "texto", "hipertexto", "video", "imagen", "audio", or "" for every
/// kind. Any other value indexes nothing.
/// </param>
/// <param name="dpto">
/// Text that <paramref name="path"/> must contain for the file to be indexed. Not applied to
/// audio and video files.
/// </param>
/// <param name="actualizar">Not used by this method.</param>
public static void ProcessFile(string path, string tipo, string dpto, bool actualizar)
{
    bool indexatexto = false, indexaaudio = false, indexaimagen = false, indexahipertexto = false, indexavideo = false;

    switch (tipo)
    {
        case "texto":
            indexatexto = true;
            break;
        case "hipertexto":
            indexahipertexto = true;
            break;
        case "video":
            indexavideo = true;
            break;
        case "imagen":
            indexaimagen = true;
            break;
        case "audio":
            indexaaudio = true;
            break;
        case "":
            indexatexto = true;
            indexahipertexto = true;
            indexavideo = true;
            indexaimagen = true;
            indexaaudio = true;
            break;
    }

    Random rnd = new Random();

    // The text flag keeps selecting hypertext files (existing behaviour); the hypertext flag,
    // which used to be ignored, selects them as well.
    if (Herramientas.EsHiperTexto(path) && (indexatexto || indexahipertexto) && path.Contains(dpto))
    {
        IndexarHipertexto(path, rnd);
    }
    else if (path.EndsWith(".txt") && indexatexto && path.Contains(dpto))
    {
        IndexarTextoPlano(path, rnd);
    }
    else if (Herramientas.EsWord(path) && indexatexto && path.Contains(dpto))
    {
        IndexarWord(path, rnd);
    }
    else if (Herramientas.EsExcel(path) && indexatexto && path.Contains(dpto))
    {
        // Excel indexing is not implemented. The branch stays so that matching files are
        // skipped here instead of being tested against the branches below.
    }
    else if (Herramientas.EsPDF(path) && indexatexto && path.Contains(dpto))
    {
        IndexarPdf(path, rnd);
    }
    else if (Herramientas.EsPowerPoint(path) && indexatexto && path.Contains(dpto))
    {
        IndexarPowerPoint(path, rnd);
    }
    else if (Herramientas.EsImagen(path) && indexaimagen && path.Contains(dpto))
    {
        IndexarImagen(path, rnd);
    }
    // NOTE(review): audio and video are the only kinds not filtered by dpto - confirm intent.
    else if (Herramientas.EsAudio(path) && indexaaudio)
    {
        IndexarAudio(path, rnd);
    }
    else if (Herramientas.EsVideo(path) && indexavideo)
    {
        IndexarVideo(path, rnd);
    }
}

/// <summary>Derives the public URL, server id and department from the file's URI.</summary>
private static void ExtraerUbicacion(string path, out string urlRuta, out string servidor, out string departamento)
{
    string absoluta = new Uri(path).AbsoluteUri;

    // Everything after "servidores/" is "<server>/<department>/<rest>".
    // Assumes the marker is present; no validation is performed here.
    string relativa = absoluta.Substring(absoluta.IndexOf("servidores/") + 11);
    urlRuta = "http://localhost/servidores/" + relativa;

    servidor = relativa.Substring(0, relativa.IndexOf("/"));

    int inicioDepartamento = relativa.IndexOf(servidor + "/") + servidor.Length + 1;
    string resto = relativa.Substring(inicioDepartamento);
    departamento = resto.Substring(0, resto.IndexOf("/")).Replace("%20", " ");
}

/// <summary>Creates a Texto carrying the given content plus the location, size, name, dates and hits of the file.</summary>
private static Texto CrearTexto(string path, string contenido, Random rnd)
{
    string urlRuta, servidor, departamento;
    ExtraerUbicacion(path, out urlRuta, out servidor, out departamento);

    Texto t = new Texto();
    t.urlRuta = urlRuta;
    t.idServidor = servidor;
    t.departamento = departamento;
    t.textoContenido = contenido;
    t.tamanoArchivo = new FileInfo(path).Length / 1024.0;
    t.formato = System.IO.Path.GetExtension(path);
    t.nombreArchivo = System.IO.Path.GetFileName(path);
    t.fechaCreacionArchivo = File.GetCreationTime(path);
    t.fechaModificacionArchivo = File.GetLastWriteTime(path);
    t.fechaUltimaLectura = File.GetLastAccessTime(path);
    t.fechaUltimaActualizacion = DateTime.Now;
    t.hits = rnd.Next(1, 55);
    if (File.Exists(path))
    {
        t.estadoActividad = 1;
    }
    return t;
}

/// <summary>Inserts the Texto, or updates the entry Elasticsearch already holds for it.</summary>
private static void GuardarTexto(Texto t)
{
    Texto existente = OperacionesElasticSearch.ExisteTexto(t);
    if (existente == null)
    {
        OperacionesElasticSearch.InsertarTexto(t);
    }
    else
    {
        OperacionesElasticSearch.actualizarTexto(existente, t);
    }
}

/// <summary>Indexes an HTML file using the inner text of its body element.</summary>
private static void IndexarHipertexto(string path, Random rnd)
{
    HtmlNode html = GetNodes(new Uri(path));

    // A document without a body element is indexed with empty text instead of failing.
    HtmlNode nodoCuerpo = html.CssSelect("body").FirstOrDefault();
    string body = nodoCuerpo == null ? string.Empty : nodoCuerpo.InnerText;
    body = Regex.Replace(body, "<.*?>", string.Empty);
    // Replacement strings do not process character escapes, so a real newline is passed here.
    body = Regex.Replace(body, @"(?:(?:\r?\n)+ +){2,}", "\n");

    string urlRuta, servidor, departamento;
    ExtraerUbicacion(path, out urlRuta, out servidor, out departamento);

    Hipertexto h = new Hipertexto();
    h.textoContenido = body;
    h.tamanoArchivo = new FileInfo(path).Length / 1024.0;
    h.urlRuta = urlRuta;
    h.idServidor = servidor;
    h.departamento = departamento;
    h.formato = System.IO.Path.GetExtension(path);
    h.nombreArchivo = System.IO.Path.GetFileName(path);
    h.fechaCreacionArchivo = File.GetCreationTime(path);
    h.fechaModificacionArchivo = File.GetLastWriteTime(path);
    h.fechaUltimaLectura = File.GetLastAccessTime(path);
    h.fechaUltimaActualizacion = DateTime.Now;
    h.hits = rnd.Next(1, 55);
    if (File.Exists(path))
    {
        h.estadoActividad = 1;
    }

    Hipertexto existente = OperacionesElasticSearch.ExisteHipertexto(h);
    if (existente == null)
    {
        OperacionesElasticSearch.InsertarHiperTexto(h);
    }
    else
    {
        OperacionesElasticSearch.actualizarHipertexto(existente, h);
    }
}

/// <summary>Indexes a .txt file with its full content and the NT account that owns it.</summary>
private static void IndexarTextoPlano(string path, Random rnd)
{
    string contenido = System.IO.File.ReadAllText(path);
    string propietario = System.IO.File.GetAccessControl(path).GetOwner(typeof(System.Security.Principal.NTAccount)).ToString();

    Texto t = CrearTexto(path, contenido, rnd);
    t.estadoActividad = 1;
    // The title mirrors the file name, so it is assigned once the name is known.
    t.titulo = t.nombreArchivo;
    t.autorArchivo = propietario;
    GuardarTexto(t);
}

/// <summary>Indexes a Word document using the text of all its paragraphs.</summary>
private static void IndexarWord(string path, Random rnd)
{
    Texto t = CrearTexto(path, LeerTextoWord(path), rnd);
    t.titulo = t.nombreArchivo;
    GuardarTexto(t);
}

/// <summary>Reads every paragraph of a Word document through one hidden Word instance that is always shut down.</summary>
private static string LeerTextoWord(string path)
{
    var aplicacion = new Microsoft.Office.Interop.Word.Application();
    Microsoft.Office.Interop.Word.Document documento = null;
    try
    {
        aplicacion.Visible = false;
        documento = aplicacion.Documents.Open(path, ReadOnly: true);

        StringBuilder texto = new StringBuilder();
        int totalParrafos = documento.Paragraphs.Count;
        // Word collections are 1-based.
        for (int i = 1; i <= totalParrafos; i++)
        {
            texto.Append(" \r\n ").Append(documento.Paragraphs[i].Range.Text);
        }
        return texto.ToString();
    }
    finally
    {
        try
        {
            if (documento != null)
            {
                // Cast to the underscore interface: Close/Quit are also event names on the
                // combined interop interfaces.
                ((Microsoft.Office.Interop.Word._Document)documento).Close();
            }
        }
        finally
        {
            ((Microsoft.Office.Interop.Word._Application)aplicacion).Quit();
        }
    }
}

/// <summary>Indexes a PDF using its extracted text with every whitespace run collapsed to one space.</summary>
private static void IndexarPdf(string path, Random rnd)
{
    string contenido = Regex.Replace(new TextExtractor().Extract(path).Text, @"\s+", " ");

    Texto t = CrearTexto(path, contenido, rnd);
    t.titulo = "";
    GuardarTexto(t);
}

/// <summary>Indexes a PowerPoint presentation using the text of its shapes and its Author property.</summary>
private static void IndexarPowerPoint(string path, Random rnd)
{
    string autor;
    string contenido = LeerTextoPowerPoint(path, out autor);

    Texto t = CrearTexto(path, contenido.Trim(), rnd);
    t.autorArchivo = autor;
    GuardarTexto(t);
}

/// <summary>Collects the text of every text-bearing shape and the built-in Author property, then closes the presentation and quits PowerPoint.</summary>
private static string LeerTextoPowerPoint(string path, out string autor)
{
    var aplicacion = new Microsoft.Office.Interop.PowerPoint.Application();
    Microsoft.Office.Interop.PowerPoint.Presentation presentacion = null;
    try
    {
        presentacion = aplicacion.Presentations.Open(path);

        StringBuilder texto = new StringBuilder();
        int totalDiapositivas = presentacion.Slides.Count;
        // PowerPoint collections are 1-based.
        for (int i = 1; i <= totalDiapositivas; i++)
        {
            foreach (var item in presentacion.Slides[i].Shapes)
            {
                var shape = (Microsoft.Office.Interop.PowerPoint.Shape)item;
                if (shape.HasTextFrame == MsoTriState.msoTrue && shape.TextFrame.HasText == MsoTriState.msoTrue)
                {
                    texto.Append(shape.TextFrame.TextRange.Text).Append(" ");
                }
            }
        }

        // The built-in properties are only reachable late-bound, hence the reflection calls.
        object propiedades = presentacion.BuiltInDocumentProperties;
        object propiedadAutor = propiedades.GetType().InvokeMember("Item", System.Reflection.BindingFlags.Default | System.Reflection.BindingFlags.GetProperty, null, propiedades, new object[] { "Author" });
        autor = propiedadAutor.GetType().InvokeMember("Value", System.Reflection.BindingFlags.Default | System.Reflection.BindingFlags.GetProperty, null, propiedadAutor, new object[] { }).ToString();

        return texto.ToString();
    }
    finally
    {
        // Close the presentation before quitting the application that hosts it.
        try
        {
            if (presentacion != null)
            {
                presentacion.Close();
            }
        }
        finally
        {
            aplicacion.Quit();
        }
    }
}

/// <summary>Indexes an image with its pixel dimensions and the fixed tags "imagen" and "foto".</summary>
private static void IndexarImagen(string path, Random rnd)
{
    string urlRuta, servidor, departamento;
    ExtraerUbicacion(path, out urlRuta, out servidor, out departamento);

    Imagen im = new Imagen();
    // Dispose the bitmap right away so the file is not kept locked.
    using (Bitmap img = new Bitmap(path))
    {
        im.pixelesAltura = img.Height;
        im.pixelesAnchura = img.Width;
    }
    im.urlRuta = urlRuta;
    im.departamento = departamento;
    im.idServidor = servidor;
    im.formato = System.IO.Path.GetExtension(path);
    im.nombreArchivo = System.IO.Path.GetFileName(path);
    im.fechaCreacionArchivo = File.GetCreationTime(path);
    im.fechaModificacionArchivo = File.GetLastWriteTime(path);
    im.fechaUltimaLectura = File.GetLastAccessTime(path);
    im.fechaUltimaActualizacion = DateTime.Now;
    im.etiquetas = new List<string> { "imagen", "foto" };
    im.hits = rnd.Next(1, 55);
    if (File.Exists(path))
    {
        im.estadoActividad = 1;
    }

    Imagen existente = OperacionesElasticSearch.ExisteImagen(im);
    if (existente == null)
    {
        OperacionesElasticSearch.InsertarImagen(im);
    }
    else
    {
        OperacionesElasticSearch.actualizarImagen(existente, im);
    }
}

/// <summary>Indexes an audio file with its duration in whole seconds and the fixed tags "audio" and "sonido".</summary>
private static void IndexarAudio(string path, Random rnd)
{
    int duracion;
    using (TagLib.File f = TagLib.File.Create(path, TagLib.ReadStyle.Average))
    {
        duracion = (int)f.Properties.Duration.TotalSeconds;
    }

    string urlRuta, servidor, departamento;
    ExtraerUbicacion(path, out urlRuta, out servidor, out departamento);

    Audio au = new Audio();
    au.urlRuta = urlRuta;
    au.departamento = departamento;
    au.duracion = duracion;
    au.etiquetas = new List<string> { "audio", "sonido" };
    au.formato = System.IO.Path.GetExtension(path);
    au.nombreArchivo = System.IO.Path.GetFileName(path);
    au.fechaCreacionArchivo = File.GetCreationTime(path);
    au.fechaModificacionArchivo = File.GetLastWriteTime(path);
    au.fechaUltimaLectura = File.GetLastAccessTime(path);
    au.fechaUltimaActualizacion = DateTime.Now;
    au.hits = rnd.Next(1, 55);
    au.idServidor = servidor;
    if (File.Exists(path))
    {
        au.estadoActividad = 1;
    }

    Audio existente = OperacionesElasticSearch.ExisteAudio(au);
    if (existente == null)
    {
        OperacionesElasticSearch.InsertarAudio(au);
    }
    else
    {
        OperacionesElasticSearch.actualizarAudio(existente, au);
    }
}

/// <summary>Indexes a video file; duration and frame size are read for ".mp4" files only.</summary>
private static void IndexarVideo(string path, Random rnd)
{
    string ext = System.IO.Path.GetExtension(path);
    int duracion = 0;
    string calidad = "";

    if (ext == ".mp4")
    {
        using (TagLib.File f = TagLib.File.Create(path, TagLib.ReadStyle.Average))
        {
            duracion = (int)f.Properties.Duration.TotalSeconds;
            if (f.Properties.VideoHeight != 0 && f.Properties.VideoWidth != 0)
            {
                int height = (int)f.Properties.VideoHeight;
                int width = (int)f.Properties.VideoWidth;
                calidad = height + "x" + width;
            }
        }
    }

    string urlRuta, servidor, departamento;
    ExtraerUbicacion(path, out urlRuta, out servidor, out departamento);

    Video v = new Video();
    v.urlRuta = urlRuta;
    v.departamento = departamento;
    v.duracion = duracion;
    v.etiquetas = new List<string> { "video" };
    v.calidad = calidad;
    v.idServidor = servidor;
    v.nombreArchivo = System.IO.Path.GetFileName(path);
    v.fechaCreacionArchivo = File.GetCreationTime(path);
    v.fechaModificacionArchivo = File.GetLastWriteTime(path);
    v.fechaUltimaLectura = File.GetLastAccessTime(path);
    v.fechaUltimaActualizacion = DateTime.Now;
    v.formato = ext;
    v.hits = rnd.Next(1, 55);
    if (File.Exists(path))
    {
        v.estadoActividad = 1;
    }

    Video existente = OperacionesElasticSearch.ExisteVideo(v);
    if (existente == null)
    {
        OperacionesElasticSearch.InsertarVideo(v);
    }
    else
    {
        OperacionesElasticSearch.actualizarVideo(existente, v);
    }
}