/// <summary>
/// Extracts headwords and sub-headwords from <paramref name="inputFile"/> and writes them to
/// <paramref name="outputFile"/> as a TEI <c>facsimile</c> element with one <c>surface</c>
/// (containing a <c>desc</c> with <c>term</c> children and a <c>graphic</c>) per page break.
/// </summary>
/// <param name="inputFile">Path of the XML document to read.</param>
/// <param name="outputFile">Path of the XML document to create.</param>
/// <param name="identifikatorDilu">
/// Volume identifier; written as the <c>n</c> attribute of <c>facsimile</c> and passed to
/// <c>DejHlavicku</c> and <c>DejNazevSouboru</c>.
/// </param>
/// <param name="changeRuleSetFile">
/// Name of the manifest resource in the executing assembly from which the change rule set is loaded.
/// </param>
/// <param name="generateHeader">
/// When <c>true</c>, the output is wrapped in a <c>TEI</c> root element that starts with the content
/// returned by <c>DejHlavicku</c>.
/// </param>
public void TestExtrahujHesla(string inputFile, string outputFile, string identifikatorDilu, string changeRuleSetFile, bool generateHeader = true)
{
    const string teiNamespace = "http://www.tei-c.org/ns/1.0";

    var assembly = Assembly.GetExecutingAssembly();
    var changeRuleSet = ChangeRuleSet.Load(assembly.GetManifestResourceStream(changeRuleSetFile));
    var xws = new XmlWriterSettings { Indent = true };

    // One GUID per distinct "headword|type|initial letter" key, shared across all pages.
    var identifikatory = new Dictionary<string, Guid>(10000);

    // Keys already written on the current page. Initialised up front so that a headword
    // appearing before the first page break no longer causes a NullReferenceException.
    var heslaPaginy = new HashSet<string>();

    using (var xw = XmlWriter.Create(outputFile, xws))
    {
        xw.WriteStartDocument();
        if (generateHeader)
        {
            xw.WriteStartElement("TEI", teiNamespace);
            var xd = new XmlDocument();
            xd.LoadXml(DejHlavicku(identifikatorDilu));
            xd.WriteContentTo(xw);
        }

        xw.WriteStartElement("facsimile", teiNamespace);
        xw.WriteAttributeString("n", identifikatorDilu);

        using (var xr = XmlReader.Create(inputFile))
        {
            var pagina = "0";
            // NOTE(review): pismeno is computed but not written anywhere in this method;
            // the term subtype is derived from the headword itself instead.
            var pismeno = "M";
            var divLevel = 0;
            var lastEntryId = "";
            XmlReader lastForm = null;
            var iHeslo = 1;
            var surfaceIsOpen = false;
            var descIsOpen = false;

            xr.MoveToContent();
            while (xr.Read())
            {
                var nodeName = xr.Name;
                if (xr.NodeType == XmlNodeType.Element)
                {
                    switch (nodeName)
                    {
                        case "div":
                            // Empty <div/> elements produce no EndElement, so they must not be counted.
                            if (!xr.IsEmptyElement)
                            {
                                divLevel++;
                            }
                            break;

                        case "head":
                        case "Pismeno":
                            // Only second-level <head> elements are treated as letter headings.
                            if (nodeName == "head" && divLevel != 2)
                            {
                                break;
                            }
                            pismeno = Objekty.ReadCurrentNodeContentAsString(xr).ToUpper();
                            var parenIndex = pismeno.IndexOf('(');
                            if (parenIndex > -1)
                            {
                                // Keep everything before '('; Trim removes the separating space.
                                // (The previous "index - 1" length threw when '(' came first and
                                // dropped a letter when no space preceded the parenthesis.)
                                pismeno = pismeno.Substring(0, parenIndex).Trim();
                            }
                            if (pismeno.EndsWith("."))
                            {
                                pismeno = pismeno.Substring(0, pismeno.Length - 1);
                            }
                            break;

                        case "pb":
                        case "Paginace":
                            // A new page starts: reset per-page de-duplication and finish the previous page.
                            heslaPaginy = new HashSet<string>();
                            CloseOpenSurface(xw, pagina, identifikatorDilu, ref descIsOpen, ref surfaceIsOpen);

                            pagina = nodeName == "pb"
                                ? xr.GetAttribute("n")
                                : Objekty.ReadCurrentNodeContentAsString(xr).Trim();

                            xw.WriteStartElement("surface");
                            surfaceIsOpen = true;
                            xw.WriteAttributeString("n", pagina);
                            xw.WriteStartElement("desc");
                            descIsOpen = true;
                            break;

                        case "entryFree":
                            lastEntryId = xr.GetAttribute("xml:id");
                            break;

                        case "form":
                            // NOTE(review): the subtree reader is kept while xr keeps being read, and it
                            // is never closed. XmlReader.ReadSubtree documents that the original reader
                            // should not be used until the subtree reader is closed - confirm this
                            // interplay is intended before restructuring it.
                            lastForm = xr.ReadSubtree();
                            break;

                        case "orth":
                        case "heslo":
                        case "podhesli":
                            string heslo = UppercaseFirst(Objekty.ReadCurrentNodeContentAsString(xr).Trim());
                            var type = nodeName == "podhesli" ? "detail" : "main";

                            var pocatecniPismeno = RemoveNonLetters(heslo);
                            if (pocatecniPismeno.Length == 0)
                            {
                                break; // error - this should not happen; TODO
                            }
                            pocatecniPismeno = DejPocatecniPismeno(pocatecniPismeno, true);

                            var identifikator = string.Format("{0}|{1}|{2}", heslo.ToLower(), type, pocatecniPismeno);
                            Guid termId;
                            if (!identifikatory.TryGetValue(identifikator, out termId))
                            {
                                termId = Guid.NewGuid();
                                identifikatory.Add(identifikator, termId);
                            }

                            // MDM does not allow duplicate headwords on a single page
                            // (an alternative would be to give every (sub)headword a unique GUID).
                            // Add returns false when the key was already written on this page.
                            if (heslaPaginy.Add(identifikator))
                            {
                                xw.WriteStartElement("term");
                                xw.WriteAttributeString("id", termId.ToString("D"));
                                xw.WriteAttributeString("type", type);
                                xw.WriteAttributeString("subtype", pocatecniPismeno);

                                string international = null;
                                if (nodeName == "orth")
                                {
                                    xw.WriteAttributeString("n", lastEntryId);

                                    // Look for a <reg> in the enclosing <form>; an <orth> without a
                                    // preceding <form> falls back to the rule set below instead of
                                    // failing on a null reader.
                                    if (lastForm != null)
                                    {
                                        while (lastForm.Read())
                                        {
                                            if (lastForm.NodeType == XmlNodeType.Element && lastForm.Name == "reg")
                                            {
                                                international = Objekty.ReadCurrentNodeContentAsString(lastForm).Trim();
                                            }
                                        }
                                    }
                                }

                                international = international ?? changeRuleSet.Apply(heslo);
                                xw.WriteAttributeString("international", international);

                                if (nodeName != "orth")
                                {
                                    // Non-<orth> terms get a running six-digit sequence number instead of an entry id.
                                    xw.WriteAttributeString("n", string.Format("{0:000000}", iHeslo++));
                                }

                                xw.WriteString(heslo);
                                xw.WriteEndElement(); // term
                            }
                            break;

                        default:
                            break;
                    }
                }
                else if (xr.NodeType == XmlNodeType.EndElement)
                {
                    if (nodeName == "div")
                    {
                        divLevel--;
                    }
                }
            }

            // Finish the last page. Without this, the two WriteEndElement calls below closed
            // desc and surface instead of facsimile and TEI, and the final surface never
            // received its graphic element.
            CloseOpenSurface(xw, pagina, identifikatorDilu, ref descIsOpen, ref surfaceIsOpen);
        }

        xw.WriteEndElement(); // facsimile
        if (generateHeader)
        {
            xw.WriteEndElement(); // TEI
        }
        xw.WriteEndDocument();
    }
}

/// <summary>
/// Closes the open <c>desc</c> element (if any) and then the open <c>surface</c> element (if any),
/// writing the page's <c>graphic</c> child just before the surface is closed.
/// </summary>
/// <param name="xw">Writer positioned inside the page being finished.</param>
/// <param name="pagina">Page identifier passed to <c>DejNazevSouboru</c> to build the graphic URL.</param>
/// <param name="identifikatorDilu">Volume identifier; for "DDBW" the graphic is written without a URL.</param>
/// <param name="descIsOpen">Whether a <c>desc</c> element is open; reset to <c>false</c> once closed.</param>
/// <param name="surfaceIsOpen">Whether a <c>surface</c> element is open; reset to <c>false</c> once closed.</param>
private void CloseOpenSurface(XmlWriter xw, string pagina, string identifikatorDilu, ref bool descIsOpen, ref bool surfaceIsOpen)
{
    if (descIsOpen)
    {
        xw.WriteEndElement(); // desc
        descIsOpen = false;
    }

    if (surfaceIsOpen)
    {
        xw.WriteStartElement("graphic");
        if (identifikatorDilu != "DDBW") // DDBW has no graphic
        {
            xw.WriteAttributeString("url", DejNazevSouboru(pagina, identifikatorDilu));
        }
        xw.WriteEndElement(); // graphic
        xw.WriteEndElement(); // surface
        surfaceIsOpen = false;
    }
}
/// <summary>
/// Copies <paramref name="inputFile"/> to <paramref name="outputFile"/> node by node. Every
/// <c>reg</c> element flagged with <c>xml:compute-reg="true"</c> is rewritten: the flag attribute
/// is dropped and the element content is replaced by the result of applying the change rule set
/// (loaded from the <c>m_changeRuleSetFile</c> manifest resource) to the element's inner XML.
/// All other nodes are passed to <c>Transformace.SerializeNode</c>.
/// </summary>
/// <param name="inputFile">Path of the XML document to read.</param>
/// <param name="outputFile">Path of the XML document to create.</param>
/// <param name="filenameWithoutExtension">Not used by this implementation.</param>
public override void SeskupitHeslaPismene(string inputFile, string outputFile, string filenameWithoutExtension)
{
    var assembly = Assembly.GetExecutingAssembly();
    var changeRuleSet = ChangeRuleSet.Load(assembly.GetManifestResourceStream(m_changeRuleSetFile));
    var xws = new XmlWriterSettings { Indent = true };

    using (var xmlWriter = XmlWriter.Create(outputFile, xws))
    using (var xmlReader = XmlReader.Create(inputFile))
    {
        xmlWriter.WriteStartDocument();

        while (xmlReader.Read())
        {
            var isComputedReg = xmlReader.NodeType == XmlNodeType.Element
                && xmlReader.Name == "reg"
                && xmlReader.GetAttribute("xml:compute-reg") == "true";

            if (!isComputedReg)
            {
                // Everything except a flagged <reg> start tag is copied through unchanged.
                Transformace.SerializeNode(xmlReader, xmlWriter);
                continue;
            }

            xmlWriter.WriteStartElement(xmlReader.Name);
            while (xmlReader.MoveToNextAttribute())
            {
                // The processing flag itself must not appear in the output.
                if (xmlReader.Name == "xml:compute-reg")
                {
                    continue;
                }
                xmlWriter.WriteAttributeString(xmlReader.Prefix, xmlReader.LocalName, xmlReader.NamespaceURI, xmlReader.Value);
            }

            // Return from the last attribute to the owning element before reading its content.
            xmlReader.MoveToElement();

            // Read the content through a subtree reader. Calling ReadInnerXml directly on
            // xmlReader positions it on the node AFTER </reg>, and the loop's next Read()
            // then skipped that node without serializing it. Closing the subtree reader
            // leaves xmlReader on the </reg> end tag (or on the empty element itself),
            // so the next Read() advances to the following node as intended.
            string innerXml;
            using (var regReader = xmlReader.ReadSubtree())
            {
                regReader.MoveToContent();
                innerXml = regReader.ReadInnerXml();
            }

            xmlWriter.WriteString(changeRuleSet.Apply(innerXml));
            xmlWriter.WriteEndElement(); // reg
        }

        xmlWriter.WriteEndDocument();
    }
}