コード例 #1
0
        //List<EMDoc> collection = new List<EMDoc>();

        /// <summary>
        /// Builds an <see cref="EMDoc"/> for one file: reads its text, assigns an ID,
        /// then fills in date, agency, type, title, officials, staff, votes and project
        /// using the class's Get* helpers.
        /// </summary>
        /// <param name="file">Path of the document to index.</param>
        /// <param name="src">
        /// Origin of the document. "emcity.org" means the ID is parsed from the file path;
        /// any other value is stored as the document source and the ID comes from
        /// <c>SQLite.InsertID</c>.
        /// </param>
        /// <param name="ocr">
        /// When true the text is read with page argument -1 and kept in <c>OCRText</c>;
        /// otherwise page argument 1 is used first and 2 later for the detail fields.
        /// </param>
        /// <param name="dt">User-supplied date; stored in the <c>userDT</c> field.</param>
        /// <returns>The populated document.</returns>
        public EMDoc AddDoc(string file, string src, bool ocr, DateTime dt)
        {
            // Class-level state; assigned before any helper is called, as in the original.
            OCR    = ocr;
            source = src;
            userDT = dt;

            EMDoc doc = new EMDoc();
            doc.Filename = file;

            // Read the file and look for the info.
            string text;

            if (OCR) // if OCRing read it all
            {
                text        = GetTextFromPDF(file, -1);
                doc.OCRText = text; // stored exactly as returned, so it may be null
            }
            else // else read first page
            {
                text = GetTextFromPDF(file, 1);
            }

            text = text ?? "";

            // TODO: keyword extraction is not wired up yet. The plan was to take the most
            // used words of the text (GetMostUsedWords), drop the terms listed in the SQLite
            // table "dbo.key_agency_ocrtext" and store the rest in an XML node. The unused
            // GetMostUsedWords call was removed until that exists.
            // NOTE(review): this assumes GetMostUsedWords has no side effects - confirm.

            // Known sources: emcity.org, GRAMA, Other
            if (source != "emcity.org")
            {
                doc.Source = source;

                // Invariant lower-casing keeps the table name stable regardless of the
                // machine's current culture.
                doc.Id = SQLite.InsertID("dbo." + source.ToLowerInvariant(), "G", Path.GetFileName(file));
            }
            else // if it is from the website
            {
                // Website files carry the numeric ID right after the last path separator,
                // followed by "_ _". No match yields an empty ID.
                Match matchID = Regex.Match(file, @"\\(\d+)_ _");
                doc.Id = matchID.Groups[1].Value;
            }

            doc.Date   = GetDate(Path.GetFileName(file), text);
            doc.Agency = GetAgency(file, text);
            doc.Type   = GetDocType(file, text);
            doc.Title  = GetTitle(file, text);

            doc.Extension = Path.GetExtension(file);

            // Try to fill in the agency based on the doctype or the title since it might
            // have changed from reading the file.
            if (doc.Agency == "")
            {
                if (doc.Type == "Agreement" || doc.Type == "Resolution")
                {
                    doc.Agency = "City Council";
                }

                if (doc.Agency == "")
                {
                    doc.Agency = GetAgency(doc.Title, "");
                }
            }

            // If the right fields are filled out, look for more info.
            // (text is guaranteed non-null here, so no null check is needed in the condition.)
            if (doc.Agency != "" && doc.Type != "")
            {
                if (!OCR) // if not OCRing the whole doc, a bit more is needed to find council and staff members
                {
                    text = GetTextFromPDF(file, 2);
                }

                if (text != null)
                {
                    doc.Officials = GetOfficials(doc.Agency, doc.Type, text);
                    doc.Staff     = GetStaff(doc.Agency, doc.Type, text);
                    doc.TypeDet   = GetTypeDet(doc.Agency, doc.Type, text);

                    if (doc.Agency == "City Council" && doc.Type == "Minutes")
                    {
                        doc.Votes = GetVotes(doc.Agency, doc.Type, text);
                    }
                }
            }

            // Project is never assigned above, so this tests EMDoc's default value.
            // IsNullOrEmpty makes detection run whether that default is "" or null.
            if (string.IsNullOrEmpty(doc.Project))
            {
                text = text ?? ""; // the second read above may have returned null

                doc.Project = GetProject(file, text);

                if (string.IsNullOrEmpty(doc.Project))
                {
                    doc.Project = GetProject(doc.Agency, doc.Type, file, text);
                }
            }

            Console.WriteLine(doc.Id);

            return doc;
        }
コード例 #2
0
        /// <summary>
        /// Background-worker handler that indexes every PDF found under the base folder.
        /// For each file it builds an <c>EMDoc</c>, copies the file into the output tree,
        /// writes an XML index next to it and, when OCR is enabled, saves the extracted text.
        /// Documents missing agency, type, title or date go to the "_incomplete" folder.
        /// </summary>
        /// <param name="sender">The background worker raising the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void BGW_Organize_DoWork(object sender, DoWorkEventArgs e)
        {
            Log.AddMessage("Beginning organization of files...", "Information");
            UpdateStatusLabel("Finding files...");

            EMDocs emdocs = new EMDocs();

            // Read the settings once so every file in this run is processed consistently.
            // NOTE(review): these controls are read from the worker's DoWork handler;
            // confirm cross-thread access is acceptable for this form.
            string outputRoot = tbOutputOrg.Text;
            string baseFolder = tbBaseFolder.Text;
            bool   ocrEnabled = cbOCR.Checked;

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(outputRoot);

            if (!Directory.Exists(baseFolder))
            {
                Console.WriteLine("Base Directory Doesn't Exist");
                return;
            }

            string[] files = Directory.GetFiles(baseFolder, "*.pdf", SearchOption.AllDirectories);

            if (files.Length == 0)
            {
                Log.AddMessage("No files found to organize...", "Information");
                MessageBox.Show("No files found to organize...", "Information");
            }

            int currentEDoc = 1;

            foreach (string f in files)
            {
                EMDoc eDoc = emdocs.AddDoc(f, sSource, ocrEnabled, (cbFillDate.Checked ? dateTimePicker1.Value : DateTime.Now));

                // Shared tail of every status message for this file.
                string progress = currentEDoc + " of " + files.Length + ":\n" + eDoc.Id + " - " + eDoc.Title;

                UpdateStatusLabel("Indexing source " + progress);

                string docDir    = outputRoot + "\\docs\\";
                string relDocDir = "\\docs\\";
                string xmlDir    = outputRoot + "\\xmls\\";
                string ocrDir    = outputRoot + "\\ocr\\";
                string relOcrDir = "\\ocr\\";

                // A date still at DateTime.MinValue means no date was found. Comparing the
                // value itself avoids the culture-specific "1/1/0001" string test.
                bool hasNoDate  = eDoc.Date.Date == DateTime.MinValue;
                bool incomplete = eDoc.Agency == "" || eDoc.Type == "" || eDoc.Title == "" || hasNoDate;

                if (incomplete)
                {
                    docDir    = outputRoot + "\\_incomplete\\";
                    relDocDir = "\\_incomplete\\";
                    xmlDir    = outputRoot + "\\_incomplete\\";
                    ocrDir    = outputRoot + "\\_incomplete\\";
                    relOcrDir = "\\_incomplete\\";
                }

                UpdateStatusLabel("Saving output " + progress);

                Directory.CreateDirectory(docDir);
                Directory.CreateDirectory(xmlDir);

                if (ocrEnabled)
                {
                    Directory.CreateDirectory(ocrDir);
                }

                string baseName    = Path.GetFileNameWithoutExtension(eDoc.Filename);
                string docFileName = baseName + eDoc.Extension;

                // Copy the source document into the output tree, overwriting any previous copy.
                File.Copy(eDoc.Filename, docDir + docFileName, true);

                UpdateStatusLabel("Building index " + progress);

                // Create the XML declaration and the root element.
                XmlDocument xml = new XmlDocument();
                xml.AppendChild(xml.CreateXmlDeclaration("1.0", "UTF-8", null));

                XmlElement root = xml.CreateElement("EMDoc");
                xml.AppendChild(root);

                AppendTextElement(xml, root, "WebsiteID", eDoc.Id.ToString());
                AppendTextElement(xml, root, "Mime", eDoc.Extension);
                AppendTextElement(xml, root, "Source", eDoc.Source);

                // SourceURL is always present but only filled in for website documents,
                // so its text is left untouched for every other source.
                XmlElement sourceURL = xml.CreateElement("SourceURL");

                if (sSource == "emcity.org")
                {
                    if (string.Equals(eDoc.Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
                    {
                        sourceURL.InnerText = @"http://www.eaglemountaincity.org/Home/ShowDocument?id=" + eDoc.Id.ToString();
                    }
                    else
                    {
                        sourceURL.InnerText = @"http://www.eaglemountaincity.org/Home/ShowImage?id=" + eDoc.Id.ToString();
                    }
                }

                root.AppendChild(sourceURL);

                AppendTextElement(xml, root, "Path", relDocDir + docFileName);
                AppendTextElement(xml, root, "Filename", docFileName);

                if (ocrEnabled)
                {
                    AppendTextElement(xml, root, "OCR", relOcrDir + baseName + ".txt");
                }

                XmlElement flds = xml.CreateElement("Fields");
                root.AppendChild(flds);

                AppendTextElement(xml, flds, "Title", eDoc.Title);
                AppendTextElement(xml, flds, "Date", eDoc.Date.ToShortDateString());
                AppendTextElement(xml, flds, "Year", eDoc.Date.Year.ToString());
                AppendTextElement(xml, flds, "Agency", eDoc.Agency);
                AppendTextElement(xml, flds, "DocType", eDoc.Type);
                AppendTextElement(xml, flds, "DocTypeDetail", eDoc.TypeDet);

                // Exactly one Project element: the document's own project, or the
                // form-level default (sProj) when the document has none. The original
                // was missing an "else" and wrote two Project elements in that case.
                bool useDefaultProject = !string.IsNullOrEmpty(sProj) && string.IsNullOrEmpty(eDoc.Project);
                AppendTextElement(xml, flds, "Project", useDefaultProject ? sProj : eDoc.Project);

                AppendTextElement(xml, flds, "Officials", eDoc.Officials);
                AppendTextElement(xml, flds, "Staff", eDoc.Staff);
                AppendTextElement(xml, flds, "Votes", eDoc.Votes);

                xml.Save(xmlDir + baseName + ".xml");

                // Skip the text file entirely when there is no OCR text (null or empty).
                if (ocrEnabled && !string.IsNullOrEmpty(eDoc.OCRText))
                {
                    UpdateStatusLabel("Saving OCR " + progress);

                    using (StreamWriter file = new System.IO.StreamWriter(ocrDir + baseName + ".txt", false))
                    {
                        file.Write(eDoc.OCRText);
                    }
                }

                currentEDoc++;
            }
        }

        /// <summary>
        /// Creates an element called <paramref name="name"/>, sets its inner text to
        /// <paramref name="text"/> and appends it to <paramref name="parent"/>.
        /// </summary>
        /// <returns>The newly appended element.</returns>
        private static XmlElement AppendTextElement(XmlDocument xml, XmlElement parent, string name, string text)
        {
            XmlElement element = xml.CreateElement(name);
            element.InnerText = text;
            parent.AppendChild(element);
            return element;
        }