//List<EMDoc> collection = new List<EMDoc>();

/// <summary>
/// Builds an <see cref="EMDoc"/> index record for a single file: reads its text, assigns an ID,
/// and fills in date, agency, type, title, officials, staff, votes and project by delegating to
/// the Get* helper parsers.
/// </summary>
/// <param name="file">Path of the document to index.</param>
/// <param name="src">Source name; the literal "emcity.org" marks a file downloaded from the website.</param>
/// <param name="ocr">True to read the whole document and keep the extracted text on the record.</param>
/// <param name="dt">Caller-supplied date; stored in <c>userDT</c> and not otherwise used by this method.</param>
/// <returns>The populated document record.</returns>
public EMDoc AddDoc(string file, string src, bool ocr, DateTime dt)
{
    OCR = ocr;
    source = src;
    userDT = dt;

    EMDoc doc = new EMDoc();
    doc.Filename = file;

    // Per the original comments: -1 reads the whole document, 1 reads only the first page.
    // A null result is normalized to "" right away so OCRText is never null either.
    string text;
    if (OCR)
    {
        text = GetTextFromPDF(file, -1) ?? "";
        doc.OCRText = text;
    }
    else
    {
        text = GetTextFromPDF(file, 1) ?? "";
    }

    // TODO: keyword extraction is not implemented yet. The plan was to compare the document's
    // most used words (GetMostUsedWords) against a keyword list stored in SQLite
    // ("dbo.key_agency_ocrtext") and record the hits in the XML. The previous code computed the
    // word list here but never used it, so that dead work has been removed.

    if (source != "emcity.org")
    {
        // GRAMA / other sources: the ID is issued by the SQLite helper.
        doc.Source = source;
        // Invariant lower-casing: the value is part of a table name, not user-facing text.
        doc.Id = SQLite.InsertID("dbo." + source.ToLowerInvariant(), "G", Path.GetFileName(file));
    }
    else
    {
        // Website downloads: the ID is the run of digits captured from the path.
        // NOTE(review): the pattern has a space between the two underscores -- confirm it matches
        // the real file names; when it does not match, Id silently becomes "".
        // NOTE(review): doc.Source is not assigned in this branch -- confirm EMDoc supplies it.
        Match matchID = Regex.Match(file, @"\\(\d+)_ _");
        doc.Id = matchID.Groups[1].Value;
    }

    doc.Date = GetDate(Path.GetFileName(file), text);
    doc.Agency = GetAgency(file, text);
    doc.Type = GetDocType(file, text);
    doc.Title = GetTitle(file, text);
    doc.Extension = Path.GetExtension(file);

    // Try to fill in the agency from the doc type or the title when the text did not reveal it.
    // IsNullOrEmpty is used so a null from a helper is treated the same as "not found".
    if (string.IsNullOrEmpty(doc.Agency))
    {
        if (doc.Type == "Agreement" || doc.Type == "Resolution")
        {
            doc.Agency = "City Council";
        }

        if (string.IsNullOrEmpty(doc.Agency))
        {
            doc.Agency = GetAgency(doc.Title, "");
        }
    }

    // With agency and type known, look for the people and vote details.
    if (!string.IsNullOrEmpty(doc.Agency) && !string.IsNullOrEmpty(doc.Type))
    {
        if (!OCR)
        {
            // Not OCRing the whole doc: a bit more text is needed to find council and staff members.
            text = GetTextFromPDF(file, 2);
        }

        if (text != null)
        {
            doc.Officials = GetOfficials(doc.Agency, doc.Type, text);
            doc.Staff = GetStaff(doc.Agency, doc.Type, text);
            doc.TypeDet = GetTypeDet(doc.Agency, doc.Type, text);

            if (doc.Agency == "City Council" && doc.Type == "Minutes")
            {
                doc.Votes = GetVotes(doc.Agency, doc.Type, text);
            }
        }
    }

    if (string.IsNullOrEmpty(doc.Project))
    {
        // The second read above may have returned null.
        text = text ?? "";

        doc.Project = GetProject(file, text);
        if (string.IsNullOrEmpty(doc.Project))
        {
            doc.Project = GetProject(doc.Agency, doc.Type, file, text);
        }
    }

    Console.WriteLine(doc.Id);
    //collection.Add(doc);
    return doc;
}
/// <summary>
/// Background-worker handler that indexes every PDF found under the base folder. For each file it
/// builds an <see cref="EMDoc"/> record, copies the file into the output tree, writes an XML index
/// next to it and, when OCR is enabled, saves the extracted text. Documents missing an agency,
/// type, title or date go to the "_incomplete" folder instead of "docs" / "xmls" / "ocr".
/// </summary>
/// <param name="sender">Not used.</param>
/// <param name="e">Not used.</param>
private void BGW_Organize_DoWork(object sender, DoWorkEventArgs e)
{
    Log.AddMessage("Beginning organization of files...", "Information");
    UpdateStatusLabel("Finding files...");

    EMDocs emdocs = new EMDocs();

    // Read the settings once so every file in this run is handled the same way.
    // NOTE(review): these are form controls read from a BackgroundWorker handler -- confirm that
    // cross-thread access is acceptable here or pass the values in through e.Argument.
    string outputRoot = tbOutputOrg.Text;
    string baseFolder = tbBaseFolder.Text;
    bool ocrEnabled = cbOCR.Checked;

    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(outputRoot);

    if (!Directory.Exists(baseFolder))
    {
        Console.WriteLine("Base Directory Doesn't Exist");
        return;
    }

    string[] files = Directory.GetFiles(baseFolder, "*.pdf", SearchOption.AllDirectories);
    if (files.Length == 0)
    {
        Log.AddMessage("No files found to organize...", "Information");
        MessageBox.Show("No files found to organize...", "Information");
    }

    int currentEDoc = 1;
    foreach (string f in files)
    {
        EMDoc eDoc = emdocs.AddDoc(f, sSource, ocrEnabled, (cbFillDate.Checked ? dateTimePicker1.Value : DateTime.Now));

        string progress = currentEDoc + " of " + files.Length + ":\n" + eDoc.Id + " - " + eDoc.Title;
        UpdateStatusLabel("Indexing source " + progress);

        // A date of 1/1/0001 means no date was found. The components are compared directly
        // because the short-date text depends on the current culture.
        int year = eDoc.Date.Year;
        int month = eDoc.Date.Month;
        int day = eDoc.Date.Day;
        bool hasNoDate = year == 0 || (year == 1 && month == 1 && day == 1);

        bool incomplete = string.IsNullOrEmpty(eDoc.Agency)
            || string.IsNullOrEmpty(eDoc.Type)
            || string.IsNullOrEmpty(eDoc.Title)
            || hasNoDate;

        // The relative folders are also written into the XML index, so their text must not change.
        string relDocDir = incomplete ? "\\_incomplete\\" : "\\docs\\";
        string relXmlDir = incomplete ? "\\_incomplete\\" : "\\xmls\\";
        string relOcrDir = incomplete ? "\\_incomplete\\" : "\\ocr\\";
        string docDir = outputRoot + relDocDir;
        string xmlDir = outputRoot + relXmlDir;
        string ocrDir = outputRoot + relOcrDir;

        UpdateStatusLabel("Saving output " + progress);

        Directory.CreateDirectory(docDir);
        Directory.CreateDirectory(xmlDir);
        if (ocrEnabled)
        {
            Directory.CreateDirectory(ocrDir);
        }

        // Output names are derived from the source file name only, so files with the same name
        // in different subfolders overwrite one another (the copy is made with overwrite = true).
        string baseName = Path.GetFileNameWithoutExtension(eDoc.Filename);
        string docFileName = baseName + eDoc.Extension;

        File.Copy(eDoc.Filename, docDir + docFileName, true);

        UpdateStatusLabel("Building index " + progress);

        XmlDocument xml = BuildIndexXml(eDoc, docFileName, baseName, relDocDir, relOcrDir, ocrEnabled);
        xml.Save(xmlDir + baseName + ".xml");

        // Skip the text file when there is nothing to write (null or empty OCR text).
        if (ocrEnabled && !string.IsNullOrEmpty(eDoc.OCRText))
        {
            UpdateStatusLabel("Saving OCR " + progress);
            File.WriteAllText(ocrDir + baseName + ".txt", eDoc.OCRText);
        }

        currentEDoc++;
    }
}

// Builds the XML index document (root element "EMDoc") that describes one organized file.
private XmlDocument BuildIndexXml(EMDoc eDoc, string docFileName, string baseName, string relDocDir, string relOcrDir, bool ocrEnabled)
{
    XmlDocument xml = new XmlDocument();
    xml.AppendChild(xml.CreateXmlDeclaration("1.0", "UTF-8", null));

    XmlElement root = xml.CreateElement("EMDoc");
    xml.AppendChild(root);

    AppendTextElement(root, "WebsiteID", eDoc.Id.ToString());
    AppendTextElement(root, "Mime", eDoc.Extension);
    AppendTextElement(root, "Source", eDoc.Source);

    // SourceURL is always present but only filled in for documents that came from the website.
    XmlElement sourceURL = xml.CreateElement("SourceURL");
    if (sSource == "emcity.org")
    {
        bool isPdf = string.Equals(eDoc.Extension, ".pdf", StringComparison.OrdinalIgnoreCase);
        sourceURL.InnerText = isPdf
            ? @"http://www.eaglemountaincity.org/Home/ShowDocument?id=" + eDoc.Id.ToString()
            : @"http://www.eaglemountaincity.org/Home/ShowImage?id=" + eDoc.Id.ToString();
    }
    root.AppendChild(sourceURL);

    AppendTextElement(root, "Path", relDocDir + docFileName);
    AppendTextElement(root, "Filename", docFileName);

    if (ocrEnabled)
    {
        AppendTextElement(root, "OCR", relOcrDir + baseName + ".txt");
    }

    XmlElement flds = xml.CreateElement("Fields");
    root.AppendChild(flds);

    AppendTextElement(flds, "Title", eDoc.Title);
    AppendTextElement(flds, "Date", eDoc.Date.ToShortDateString());
    AppendTextElement(flds, "Year", eDoc.Date.Year.ToString());
    AppendTextElement(flds, "Agency", eDoc.Agency);
    AppendTextElement(flds, "DocType", eDoc.Type);
    AppendTextElement(flds, "DocTypeDetail", eDoc.TypeDet);

    // Exactly one Project element: sProj is only a fallback for documents without a project of
    // their own. (A missing "else" used to emit a second, empty Project element in that case.)
    bool useFallbackProject = !string.IsNullOrEmpty(sProj) && string.IsNullOrEmpty(eDoc.Project);
    AppendTextElement(flds, "Project", useFallbackProject ? sProj : eDoc.Project);

    AppendTextElement(flds, "Officials", eDoc.Officials);
    AppendTextElement(flds, "Staff", eDoc.Staff);
    AppendTextElement(flds, "Votes", eDoc.Votes);

    return xml;
}

// Creates an element called name, sets its inner text and appends it to parent.
private static XmlElement AppendTextElement(XmlNode parent, string name, string text)
{
    XmlElement element = parent.OwnerDocument.CreateElement(name);
    element.InnerText = text;
    parent.AppendChild(element);
    return element;
}