Beispiel #1
0
        /// <summary>
        /// Converts the HTML export <c>Data.htm</c> into the JSON file <c>edu.json</c>.
        /// </summary>
        /// <remarks>
        /// Steps: read the file, count the occurrences of "WordSection" in the body markup, rewrite the
        /// <c>class=WordSection</c> attributes to ids so each section can be fetched with
        /// <c>GetElementbyId</c>, convert sections 3 .. count-1 into data elements, map those onto
        /// output elements of the "Education" domain, and write the serialized domain to disk.
        /// The method then waits for the user to press Enter.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Read the whole input file. The using block releases the file handle even when reading throws.
            string x;
            using (StreamReader sr = new StreamReader("Data.htm"))
            {
                x = sr.ReadToEnd();
            }

            // Extract the number of elements: occurrences of "WordSection" in the body markup.
            HtmlDocument inidoc = new HtmlDocument();
            inidoc.LoadHtml(x);
            string y     = inidoc.DocumentNode.SelectSingleNode("//body").InnerHtml;
            int    count = y.CountSubstring("WordSection");

            System.Console.WriteLine(count);

            // Further processing to speed up the later lookups: turn the section class into an id and
            // strip line breaks, tabs and stray characters.
            // NOTE(review): as rendered here the fourth Replace removes every single space, which would
            // also make the trailing double-space Replace a no-op - confirm the intended character
            // (possibly a non-breaking space).
            string data = x.Replace("class=WordSection", "id=WordSection")
                          .Replace("\n", "")
                          .Replace("\r", "")
                          .Replace("\t", "")
                          .Replace(" ", "")
                          .Replace("�", "")
                          .Replace("  ", " ");

            // Read the cleaned markup as a HtmlDocument.
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(data);
            helpers.RemoveAttributes(doc);

            DataElementFile   def = new DataElementFile();
            List<DataElement> ele = new List<DataElement>();
            def.DataElements = ele;

            Domain dm = new Domain();
            dm.domain    = "Education";
            dm.acronym   = "edu";
            dm.version   = "2017-09-26_15:54";
            dm.sourceURL = @"http://heimshelp.education.gov.au/sites/heimshelp/2018_Data_Requirements/2018DataElements/Documents/2018-HE-Data-Element-Dictionary.docx";

            // Sections are addressed as WordSection3 .. WordSection(count-1).
            // NOTE(review): the reason for starting at 3 is not visible here - presumably the first
            // sections are front matter; confirm.
            for (int i = 3; i < count; i++)
            {
                System.Console.WriteLine(i);

                var section = doc.GetElementbyId("WordSection" + i);
                if (section == null)
                {
                    // A missing section used to abort the whole run with a NullReferenceException.
                    System.Console.WriteLine("WordSection" + i + " not found, skipping");
                    continue;
                }

                ele.Add(helpers.ConvertHTML2DataElement(section.InnerHtml));
            }

            List<OutputDataElement> listOdm = new List<OutputDataElement>();
            foreach (DataElement dmf in def.DataElements)
            {
                // Elements without a parsed format table get a null data type instead of crashing First().
                bool hasFormat = dmf.CodeFormat != null && dmf.CodeFormat.Any();

                OutputDataElement odm = new OutputDataElement();
                odm.Name       = dmf.ElementName;
                odm.Domain     = "Education";
                odm.Status     = "Standard";
                odm.Definition = dmf.Description;
                odm.dataType   = new outputDataType()
                {
                    facets = new facet(),
                    type   = hasFormat ? dmf.CodeFormat.First().Value : null
                };
                odm.sourceURL  = "http://heimshelp.education.gov.au/sites/heimshelp/2018_data_requirements/2018dataelements/pages/" + dmf.ElementNumber;
                odm.identifier = "http://dxa.gov.au/definition/" + dm.acronym + "/" + dm.acronym + dmf.ElementNumber;
                odm.guidance   = "Field Name: " + dmf.FieldName;
                odm.usage      = new List<string>();
                odm.usage.Add("See source for more information");
                odm.values = new List<string>();
                listOdm.Add(odm);
            }
            dm.content = listOdm;

            // Serialize before opening the output file so a serialization failure cannot leave a
            // truncated edu.json behind; the using block closes the writer on every path.
            string json = JsonConvert.SerializeObject(dm);
            using (StreamWriter sw = new StreamWriter("edu.json", false))
            {
                sw.Write(json);
            }

            System.Console.WriteLine("Press any key to exit");

            // ReadLine returns once the user presses Enter.
            Console.ReadLine();
        }
Beispiel #2
0
        /// <summary>
        /// Builds a <c>DataElement</c> from the HTML of a single data-element section.
        /// </summary>
        /// <remarks>
        /// The element number is taken from the section text after the "ELEMENT NO." label. The section's
        /// tables are then split and merged on their headers by the helper methods, and every merged table
        /// fills the property that matches its <c>ElementType</c>.
        /// </remarks>
        /// <param name="html">Inner HTML of one section.</param>
        /// <returns>The populated element; properties whose table is absent are left unset.</returns>
        public static DataElement ConvertHTML2DataElement(string html)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            string elementNumber = ReadElementNumber(doc.DocumentNode.InnerText);

            List<HtmlNode>         tables       = helpers.SplitTables(doc);
            List<ExtendedHTMLNode> tablesMerged = helpers.MergeTablesOnHeaders(tables);
            DataElement            de           = new DataElement();

            de.ElementNumber = elementNumber;
            foreach (ExtendedHTMLNode d in tablesMerged)
            {
                if (d.type == ElementType.Vers)
                {
                    de.Version = d.node.InnerText.Replace("VERSION:", "").Trim();
                }
                else if (d.type == ElementType.FYear)
                {
                    de.FirstYear = d.node.InnerText.Replace("FIRST YEAR:", "").Trim();
                }
                else if (d.type == ElementType.LYear)
                {
                    de.LastYear = d.node.InnerText.Replace("LAST YEAR:", "").Trim();
                }
                else if (d.type == ElementType.Fld)
                {
                    de.FieldName = d.node.InnerText.Replace("FIELD NAME:", "").Trim();
                }
                else if (d.type == ElementType.EleNM)
                {
                    de.ElementName = d.node.InnerText.Replace("ELEMENT NAME:", "").Trim();
                }
                else if (d.type == ElementType.Desc)
                {
                    de.Description = d.node.InnerText.Replace("DESCRIPTION:", "").Trim();
                }
                else if (d.type == ElementType.Frmt)
                {
                    de.CodeFormat = ReadCodeFormat(d.node.InnerText, d.type);
                }
                else if (d.type == ElementType.Clas)
                {
                    List<string> para = d.node.InnerHtml.GetParagraphsListFromHtml();
                    de.Classification = ReadClassification(para, d.type);
                }
                else if (d.type == ElementType.CoNt)
                {
                    de.CodingNotes = d.node.InnerText;
                }
                else if (d.type == ElementType.InFi)
                {
                    // This table carries both the input-file list and, after a "VERSION" paragraph,
                    // the change history.
                    List<string> para = d.node.InnerHtml.GetParagraphsListFromHtml();
                    int          indexOfVERSION;

                    de.InputFiles = ReadInputFiles(para, out indexOfVERSION);

                    // Parsing starts three paragraphs after "VERSION".
                    // NOTE(review): presumably this skips three column headings - confirm. When no
                    // "VERSION" paragraph exists the index stays 0, so parsing starts at paragraph 3,
                    // exactly as before.
                    de.ChangeHistory = ReadChangeHistory(para, indexOfVERSION + 3);
                }
                else if (d.type == ElementType.CHis)
                {
                    // Not mapped onto the element; dumped to the console only.
                    System.Console.WriteLine(d.type.ToString());
                    System.Console.WriteLine(d.node.InnerHtml);
                }
            }
            return de;
        }

        // Returns the three characters that follow "ELEMENT NO." plus one separator character, or an
        // empty string when the label is missing (previously three arbitrary characters were returned).
        private static string ReadElementNumber(string text)
        {
            int labelIndex = text.IndexOf("ELEMENT NO.", StringComparison.Ordinal);
            if (labelIndex < 0)
            {
                return string.Empty;
            }

            // 11 characters of label plus one separator.
            int start = labelIndex + 12;
            if (start >= text.Length)
            {
                return string.Empty;
            }

            return text.Substring(start, Math.Min(3, text.Length - start));
        }

        // Splits the format text into its "Data Type", "Units" and "Width" entries. When a label is
        // missing or out of order a single "SystemIssue" entry is returned instead of throwing, the
        // same marker the classification parser uses.
        private static List<KeyValue> ReadCodeFormat(string text, ElementType type)
        {
            List<KeyValue> kv = new List<KeyValue>();

            int indexOfDataType = text.IndexOf("Data Type:", StringComparison.Ordinal);
            int indexOfUnits    = text.IndexOf("Units:", StringComparison.Ordinal);
            int indexOfWidth    = text.IndexOf("Width:", StringComparison.Ordinal);

            // NOTE(review): both lengths stop one character short of the next label, so a value that
            // touches the next label without a separator loses its last character - kept as in the
            // original; confirm against the source markup.
            int dataTypeLength = indexOfUnits - (indexOfDataType + 11);
            int unitsLength    = indexOfWidth - (indexOfUnits + 7);

            bool labelsUsable = indexOfDataType >= 0 && indexOfUnits >= 0 && indexOfWidth >= 0
                                && dataTypeLength >= 0 && unitsLength >= 0;
            if (!labelsUsable)
            {
                kv.Add(new KeyValue {
                    Attr = "SystemIssue", Value = "Error reading" + type.ToString()
                });
                return kv;
            }

            KeyValue kvDataType = new KeyValue();
            KeyValue kvUnits    = new KeyValue();
            KeyValue kvWidth    = new KeyValue();

            // The attribute names are the labels without their trailing colon.
            kvDataType.Attr  = text.Substring(indexOfDataType, 9).Trim();
            kvDataType.Value = text.Substring(indexOfDataType + 10, dataTypeLength).Trim();
            kvUnits.Attr     = text.Substring(indexOfUnits, 5).Trim();
            kvUnits.Value    = text.Substring(indexOfUnits + 6, unitsLength).Trim();
            kvWidth.Attr     = text.Substring(indexOfWidth, 5).Trim();
            kvWidth.Value    = text.Substring(indexOfWidth + 6).Trim();

            kv.Add(kvDataType);
            kv.Add(kvUnits);
            kv.Add(kvWidth);
            return kv;
        }

        // Pairs up the non-blank paragraphs after the first one: a paragraph becomes Attr, the paragraph
        // directly after it becomes Value, and the next non-blank paragraph is skipped. Any failure
        // appends a "SystemIssue" entry rather than propagating.
        private static List<KeyValue> ReadClassification(List<string> para, ElementType type)
        {
            List<KeyValue> kv            = new List<KeyValue>();
            bool           lastLoopAdded = false;

            try
            {
                for (int i = 1; i < para.Count; i++)
                {
                    if (string.IsNullOrWhiteSpace(para[i]))
                    {
                        continue;
                    }

                    if (lastLoopAdded)
                    {
                        // This paragraph was consumed as the previous entry's value.
                        lastLoopAdded = false;
                        continue;
                    }

                    if (i + 1 >= para.Count)
                    {
                        // Trailing paragraph without a partner: same marker the original produced
                        // through its catch block.
                        kv.Add(new KeyValue {
                            Attr = "SystemIssue", Value = "Error reading" + type.ToString()
                        });
                        break;
                    }

                    KeyValue kvInstance = new KeyValue();
                    kvInstance.Attr  = para[i];
                    kvInstance.Value = para[i + 1];

                    // NOTE(review): Contains depends on KeyValue equality; if KeyValue does not
                    // implement value equality this never matches - confirm.
                    if (!kv.Contains(kvInstance))
                    {
                        kv.Add(kvInstance);
                    }
                    lastLoopAdded = true;
                }
            }
            catch (Exception)
            {
                // Best effort: keep what was parsed so far and flag the problem in the data.
                kv.Add(new KeyValue {
                    Attr = "SystemIssue", Value = "Error reading" + type.ToString()
                });
            }
            return kv;
        }

        // Collects the distinct paragraphs that precede the "VERSION" paragraph, leaving out the known
        // heading texts. indexOfVERSION receives the position of "VERSION", or 0 when there is none.
        private static List<string> ReadInputFiles(List<string> para, out int indexOfVERSION)
        {
            string[] excludeWords = new string[]
            {
                "INPUT FILES:",
                "HEP - Student",
                "HEP - Staff",
                "HEP - Applications and Offers"
            };

            List<string> inputFiles = new List<string>();
            indexOfVERSION = 0;

            foreach (string entry in para)
            {
                if (entry.Equals("VERSION"))
                {
                    indexOfVERSION = para.IndexOf(entry);
                    break;
                }

                if (!excludeWords.Contains(entry) && !inputFiles.Contains(entry))
                {
                    inputFiles.Add(entry);
                }
            }
            return inputFiles;
        }

        // Reads change records starting at the given paragraph index. Blank paragraphs are skipped (the
        // original loop never advanced past one and hung), and an incomplete trailing record ends the scan.
        private static List<ChangeRecord> ReadChangeHistory(List<string> para, int start)
        {
            List<ChangeRecord> changeHist = new List<ChangeRecord>();

            int i = start;
            while (i < para.Count)
            {
                if (string.IsNullOrWhiteSpace(para[i]))
                {
                    i++;
                    continue;
                }

                if (i + 2 >= para.Count)
                {
                    // Fewer than three paragraphs left; the original stopped here through a swallowed
                    // index exception.
                    break;
                }

                ChangeRecord cr = new ChangeRecord();
                cr.Version       = para[i];
                cr.RevisionDate  = para[i + 1];
                cr.ReportingYear = para[i + 2];
                changeHist.Add(cr);

                // NOTE(review): a record spans three paragraphs but the cursor advances by two, so the
                // next record starts on this record's ReportingYear paragraph - kept as in the original;
                // confirm against the paragraph layout.
                i = i + 2;
            }
            return changeHist;
        }