Пример #1
0
        /// <summary>
        ///     Extracts headwords and sub-headwords from <paramref name="inputFile"/> and writes them to
        ///     <paramref name="outputFile"/> as a TEI <c>facsimile</c> element. Every <c>pb</c>/<c>Paginace</c>
        ///     element of the input starts a new <c>surface</c>, which holds a <c>desc</c> with the page's
        ///     <c>term</c> elements followed by a <c>graphic</c> element.
        /// </summary>
        /// <param name="inputFile">Location of the XML document to read (passed to <c>XmlReader.Create</c>).</param>
        /// <param name="outputFile">Name of the XML file to create (passed to <c>XmlWriter.Create</c>).</param>
        /// <param name="identifikatorDilu">
        ///     Volume identifier. Written as the <c>n</c> attribute of <c>facsimile</c> and passed to
        ///     <c>DejHlavicku</c> and <c>DejNazevSouboru</c>. For the value <c>"DDBW"</c> the <c>graphic</c>
        ///     element is written without a <c>url</c> attribute.
        /// </param>
        /// <param name="changeRuleSetFile">Name of the embedded manifest resource the change rule set is loaded from.</param>
        /// <param name="generateHeader">
        ///     When <c>true</c>, the output is wrapped in a <c>TEI</c> root element that starts with the
        ///     content returned by <c>DejHlavicku</c>.
        /// </param>
        public void TestExtrahujHesla(string inputFile, string outputFile, string identifikatorDilu, string changeRuleSetFile, bool generateHeader = true)
        {
            const string teiNamespace = "http://www.tei-c.org/ns/1.0";

            var assembly      = Assembly.GetExecutingAssembly();
            var changeRuleSet = ChangeRuleSet.Load(assembly.GetManifestResourceStream(changeRuleSetFile));

            var xws = new XmlWriterSettings {
                Indent = true
            };

            // One GUID per distinct "headword|type|initial letter" key for the whole run.
            var identifikatory = new Dictionary <string, Guid>(10000);

            // Keys already written on the current page. Starts out empty (not null) so that a headword
            // appearing before the first page break is still processed instead of failing on a null reference.
            var heslaPaginy = new HashSet <string>();

            using (var xw = XmlWriter.Create(outputFile, xws))
            {
                xw.WriteStartDocument();

                if (generateHeader)
                {
                    xw.WriteStartElement("TEI", teiNamespace);

                    var xd = new XmlDocument();
                    xd.LoadXml(DejHlavicku(identifikatorDilu));
                    xd.WriteContentTo(xw);
                }

                xw.WriteStartElement("facsimile", teiNamespace);
                xw.WriteAttributeString("n", identifikatorDilu);

                using (var xr = XmlReader.Create(inputFile))
                {
                    var pagina = "0";

                    // Current letter heading. It is tracked but not written to the output;
                    // the term's "subtype" uses the initial letter derived from the headword itself.
                    var pismeno = "M";

                    var divLevel = 0;

                    string    lastEntryId = "";
                    XmlReader lastForm    = null;

                    var iHeslo        = 1;
                    var surfaceIsOpen = false;
                    var descIsOpen    = false;

                    // Closes the currently open <desc> and <surface> (if any), adding the <graphic>
                    // element of the page just before </surface>. Reads 'pagina' at call time, so it
                    // must be invoked before 'pagina' is advanced to the next page.
                    Action closeOpenSurface = () =>
                    {
                        if (descIsOpen)
                        {
                            xw.WriteEndElement();     //desc
                            descIsOpen = false;
                        }
                        if (surfaceIsOpen)
                        {
                            xw.WriteStartElement("graphic");

                            if (identifikatorDilu != "DDBW")     //DDBW has no graphic
                            {
                                xw.WriteAttributeString("url", DejNazevSouboru(pagina, identifikatorDilu));
                            }

                            xw.WriteEndElement();     //graphic

                            xw.WriteEndElement();     //surface
                            surfaceIsOpen = false;
                        }
                    };

                    xr.MoveToContent();

                    while (xr.Read())
                    {
                        // Captured up front: some branches below move the reader.
                        var nodeName = xr.Name;
                        if (xr.NodeType == XmlNodeType.Element)
                        {
                            switch (nodeName)
                            {
                            case "div":
                                // An empty <div/> produces no EndElement node, so it must not be counted.
                                if (!xr.IsEmptyElement)
                                {
                                    divLevel++;
                                }

                                break;

                            case "head":
                            case "Pismeno":
                                // Only a <head> at div nesting level 2 is treated as a letter heading.
                                if (nodeName == "head" && divLevel != 2)
                                {
                                    break;
                                }

                                pismeno = Objekty.ReadCurrentNodeContentAsString(xr).ToUpper();

                                // Cut off a parenthesised suffix. Cutting at the parenthesis itself and
                                // trimming handles both "M (…)" and "M(…)", and does not throw when the
                                // text starts with '('.
                                var parenIndex = pismeno.IndexOf('(');
                                if (parenIndex > -1)
                                {
                                    pismeno = pismeno.Substring(0, parenIndex).Trim();
                                }
                                if (pismeno.EndsWith(".", StringComparison.Ordinal))
                                {
                                    pismeno = pismeno.Substring(0, pismeno.Length - 1);
                                }

                                break;

                            case "pb":
                            case "Paginace":
                                // A new page starts: forget the previous page's headwords and finish its <surface>.
                                heslaPaginy.Clear();
                                closeOpenSurface();

                                pagina = nodeName == "pb"
                                        ? xr.GetAttribute("n")
                                        : Objekty.ReadCurrentNodeContentAsString(xr).Trim();
                                xw.WriteStartElement("surface");
                                surfaceIsOpen = true;
                                xw.WriteAttributeString("n", pagina);
                                xw.WriteStartElement("desc");
                                descIsOpen = true;

                                break;

                            case "entryFree":
                                lastEntryId = xr.GetAttribute("xml:id");

                                break;

                            case "form":
                                // The subtree reader is not advanced here; it is only read later, when an
                                // <orth> is encountered, to look for a following <reg>.
                                // NOTE(review): the XmlReader.ReadSubtree documentation says the original
                                // reader should not be used until the subtree reader is closed. This code
                                // keeps reading 'xr' in between and never closes the subtree reader —
                                // confirm the interleaving is intended.
                                lastForm = xr.ReadSubtree();

                                break;

                            case "orth":
                            case "heslo":
                            case "podhesli":
                                var heslo = Objekty.ReadCurrentNodeContentAsString(xr).Trim();
                                heslo = UppercaseFirst(heslo);
                                var type = nodeName == "podhesli" ? "detail" : "main";

                                var pocatecniPismeno = RemoveNonLetters(heslo);

                                if (pocatecniPismeno.Length == 0)
                                {
                                    break;                                   //error - this should not happen; TODO
                                }
                                pocatecniPismeno = DejPocatecniPismeno(pocatecniPismeno, true);

                                var identifikator = string.Format("{0}|{1}|{2}", heslo.ToLower(), type, pocatecniPismeno);

                                // Reuse the GUID of an already known key, otherwise register a new one.
                                Guid termId;
                                if (!identifikatory.TryGetValue(identifikator, out termId))
                                {
                                    termId = Guid.NewGuid();
                                    identifikatory.Add(identifikator, termId);
                                }

                                // A page must not contain duplicate headwords in MDM
                                // (an alternative would be to give every (sub)headword a unique GUID).
                                // HashSet.Add returns false when the key is already present on this page.
                                if (heslaPaginy.Add(identifikator))
                                {
                                    xw.WriteStartElement("term");
                                    xw.WriteAttributeString("id", termId.ToString("D"));
                                    xw.WriteAttributeString("type", type);
                                    xw.WriteAttributeString("subtype", pocatecniPismeno);

                                    string international = null;

                                    if (nodeName == "orth")     //fails (null reference) if no <form> was seen before this <orth>
                                    {
                                        xw.WriteAttributeString("n", lastEntryId);

                                        // Scan what is left of the enclosing <form> for <reg>;
                                        // the last one found supplies the "international" value.
                                        while (lastForm.Read())
                                        {
                                            if (lastForm.NodeType == XmlNodeType.Element && lastForm.Name == "reg")
                                            {
                                                international = Objekty.ReadCurrentNodeContentAsString(lastForm).Trim();
                                            }
                                        }
                                    }

                                    // No <reg> available: derive the value from the headword via the rule set.
                                    international = international ?? changeRuleSet.Apply(heslo);

                                    xw.WriteAttributeString("international", international);

                                    if (nodeName != "orth")
                                    {
                                        // Non-<orth> headwords are numbered sequentially across the whole document.
                                        xw.WriteAttributeString("n", string.Format("{0:000000}", iHeslo++));
                                    }
                                    xw.WriteString(heslo);
                                    xw.WriteEndElement();     //term
                                }

                                break;

                            default:
                                break;
                            }
                        }
                        else if (xr.NodeType == XmlNodeType.EndElement && nodeName == "div")
                        {
                            divLevel--;
                        }
                    }

                    // Finish the last page as well, so that it also receives its <graphic> element and the
                    // end-element calls below close the elements their comments name.
                    closeOpenSurface();
                }
                xw.WriteEndElement(); //facsimile
                if (generateHeader)
                {
                    xw.WriteEndElement(); //TEI
                }
                xw.WriteEndDocument();
            }
        }
Пример #2
0
        /// <summary>
        ///     Copies <paramref name="inputFile"/> to <paramref name="outputFile"/> node by node via
        ///     <c>Transformace.SerializeNode</c>. The only nodes treated specially are <c>reg</c> elements whose
        ///     <c>xml:compute-reg</c> attribute equals <c>"true"</c>: they are written without that attribute and
        ///     their content is replaced by the result of applying the change rule set to their inner XML.
        /// </summary>
        /// <param name="inputFile">Location of the XML document to read (passed to <c>XmlReader.Create</c>).</param>
        /// <param name="outputFile">Name of the XML file to create (passed to <c>XmlWriter.Create</c>).</param>
        /// <param name="filenameWithoutExtension">Not used by this implementation.</param>
        public override void SeskupitHeslaPismene(string inputFile, string outputFile, string filenameWithoutExtension)
        {
            var assembly      = Assembly.GetExecutingAssembly();
            var changeRuleSet = ChangeRuleSet.Load(assembly.GetManifestResourceStream(m_changeRuleSetFile));

            var xws = new XmlWriterSettings {
                Indent = true
            };

            using (var xmlWriter = XmlWriter.Create(outputFile, xws))
                using (var xmlReader = XmlReader.Create(inputFile))
                {
                    xmlWriter.WriteStartDocument();

                    var hasNode = xmlReader.Read();
                    while (hasNode)
                    {
                        var isComputedReg = xmlReader.NodeType == XmlNodeType.Element
                                            && xmlReader.Name == "reg"
                                            && xmlReader.GetAttribute("xml:compute-reg") == "true";

                        if (!isComputedReg)
                        {
                            // Every other node (including ordinary <reg> elements and all end tags)
                            // is passed through unchanged.
                            Transformace.SerializeNode(xmlReader, xmlWriter);
                            hasNode = xmlReader.Read();
                            continue;
                        }

                        xmlWriter.WriteStartElement(xmlReader.Name);

                        // Copy all attributes except the processing marker itself.
                        while (xmlReader.MoveToNextAttribute())
                        {
                            if (xmlReader.Name == "xml:compute-reg")
                            {
                                continue;
                            }

                            xmlWriter.WriteAttributeString(xmlReader.Prefix, xmlReader.LocalName, xmlReader.NamespaceURI, xmlReader.Value);
                        }

                        // Return from the last attribute to the owning element before reading its content.
                        xmlReader.MoveToElement();

                        // NOTE(review): ReadInnerXml returns markup (entities stay escaped) and WriteString
                        // escapes again — confirm <reg> content is plain text without entities.
                        xmlWriter.WriteString(changeRuleSet.Apply(xmlReader.ReadInnerXml()));

                        xmlWriter.WriteEndElement();

                        // ReadInnerXml has already moved the reader to the node that follows </reg>.
                        // Calling Read() again here would skip that node, so only check for end of input.
                        hasNode = !xmlReader.EOF;
                    }

                    xmlWriter.WriteEndDocument();
                }
        }