// ------------------------------------------------------------------------------------------------------------
// Name:   getXmlStats
// Goal:   Read the XML file [sFileIn] and:
//         - collect the inner XML of every <t> element whose @class is 'nld' or 'nl'
//         - compute a hash string over the collected text (returned in [sSimHash])
//         - count the <w> elements that have @class 'Vern' (returned in [iWords])
//         - count the <s> elements (returned in [iSents])
//         - add status-info evidence to [lStat]: each collected <t> line that matches one of the
//           trigger patterns, followed by " // " and the next collected <t> line (when there is one)
// Note:   As a debugging aid the collected text is also written to the file [sFileIn] + ".txt"
// Return: true on success; false when an exception occurred (it is reported through errHandle.DoError)
// History:
// 04-02-2016  ERK Created
// ------------------------------------------------------------------------------------------------------------
public bool getXmlStats(String sFileIn, ref String sSimHash, List<String> lStat, ref int iWords, ref int iSents) {
    String sMethod = "simhash";     // Hash method that is used: "simhash" or "md5"

    try {
        // Initialise
        iWords = 0;
        iSents = 0;
        bool bUseNext = false;      // True when the next collected <t> line completes a pending status entry
        // Collects the text of all accepted <t> nodes, one per line
        StringBuilder sbThis = new StringBuilder();
        // Collects one potential StatusInfo entry
        StringBuilder sbStat = new StringBuilder();

        // Create an XmlReader to walk through the input file
        using (StreamReader rdFileTmp = new StreamReader(sFileIn))
        using (XmlReader rdFolia = XmlReader.Create(rdFileTmp)) {
            bool bMore = rdFolia.Read();
            while (bMore) {
                // Set when ReadInnerXml() has already moved the reader to the node after </t>
                bool bAdvanced = false;

                if (rdFolia.IsStartElement("t")) {
                    // It needs to have an attribute [class]
                    if (rdFolia.HasAttributes) {
                        String sClass = rdFolia.GetAttribute("class");
                        if (sClass == "nld" || sClass == "nl") {
                            // Correct attribute: read the node. This positions the reader AFTER the end tag
                            String sContent = rdFolia.ReadInnerXml();
                            bAdvanced = true;
                            String sLine = sContent + "\n";
                            sbThis.Append(sLine);

                            // Check for StatusInfo
                            if (bUseNext) {
                                // This line follows a trigger line: complete the entry and store it
                                sbStat.Append(" // " + sContent);
                                lStat.Add(sbStat.ToString());
                                sbStat.Clear();
                                bUseNext = false;
                            } else if (General.DoLike(sLine.ToLowerInvariant(), "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*")) {
                                // Trigger line (lower-cased culture-independently, since the patterns are plain ASCII)
                                // Defensive separator in case the buffer still holds something
                                if (sbStat.Length > 0) {
                                    sbStat.Append("\n");
                                }
                                sbStat.Append(sContent);
                                bUseNext = true;
                            }
                        }
                    }
                } else if (rdFolia.IsStartElement("w")) {
                    // Only words of class 'Vern' are counted
                    String sClass = rdFolia.GetAttribute("class");
                    if (sClass == "Vern") {
                        iWords += 1;
                    }
                } else if (rdFolia.IsStartElement("s")) {
                    // Keep track of the number of sentences
                    iSents += 1;
                }

                // Only advance when the reader has not been moved already: calling Read() after
                // ReadInnerXml() would skip the node that directly follows the </t> end tag
                bMore = bAdvanced ? !rdFolia.EOF : rdFolia.Read();
            }
        }

        // A trigger line without any following <t> line is still evidence: do not lose it
        if (bUseNext && sbStat.Length > 0) {
            lStat.Add(sbStat.ToString());
            sbStat.Clear();
        }

        // Create one string from the whole
        String sTotal = sbThis.ToString();

        // =============== DEBUG ===============
        // Store the string into a text file
        File.WriteAllText(sFileIn + ".txt", sTotal, System.Text.Encoding.UTF8);
        // =====================================

        switch (sMethod) {
            case "md5":
                // Method #1: compute the MD5 hash from the UTF-8 bytes of this string
                using (MD5 md5 = MD5.Create()) {
                    byte[] hashBytes = md5.ComputeHash(Encoding.UTF8.GetBytes(sTotal));
                    // Convert the byte array to a hash string
                    sSimHash = ByteArrayToString(hashBytes);
                }
                break;
            case "simhash":
                // Method #2: compute the simhash from this string
                SimHashAnalyzer oAna = new SimHashAnalyzer();
                UInt64 iSimHash = oAna.DoCalculateSimHash(sTotal);
                // Convert integer to string
                sSimHash = Convert.ToString(iSimHash);
                break;
        }
        return (true);
    } catch (Exception ex) {
        // Warn the user
        errHandle.DoError("xmlTools/getXmlStats", ex);
        return (false);
    }
}
// ------------------------------------------------------------------------------------
// Name:   eTreeSentence
// Goal:   Re-analyze the sentence (the <forest>) that [ndxThis] belongs to:
//         1. When [bDoOrg] is set, walk the selected [eLeaf] nodes in order and
//            a. rebuild the text of the <seg> under <div lang='org'>
//            b. set @from (1-based start) and @to (end position) of each [eLeaf]
//         2. Set @from/@to of every [eTree] to the @from of its first and the @to
//            of its last descendant [eLeaf]
// Return: true when an existing @from/@to value has been altered;
//         false when nothing changed, when a required node is missing or on error
// Note:   [ndxNew] is not assigned by this method
// History:
// 03-01-2013  ERK Created
// ------------------------------------------------------------------------------------
public bool eTreeSentence(ref XmlNode ndxThis, ref XmlNode ndxNew, bool bVerbose = false, bool bOldEnglish = false, bool bDoOrg = true) {
    bool bDirty = false;    // Whether any position value has in fact been altered
    int iStart = 0;         // Start of the current word (kept across iterations)
    int iEnd = 0;           // End of the current word (kept across iterations)

    try {
        // Guard: something must be selected, and it must be inside a forest
        if (ndxThis == null) {
            return (false);
        }
        XmlNode ndxForest = ndxThis.SelectSingleNode("./ancestor-or-self::forest[1]");
        if (ndxForest == null) {
            return (false);
        }

        // Part 1: recalculate the "org" text and the leaf positions
        if (bDoOrg) {
            XmlNode ndxSeg = ndxForest.SelectSingleNode("./child::div[@lang='org']/seg");
            if (ndxSeg == null) {
                return (false);
            }

            // All leaves that have no ancestor whose @Label matches [strNoText]
            XmlNodeList lstLeaf = ndxForest.SelectNodes(".//descendant::eLeaf[count(ancestor::eTree[tb:matches(@Label, '" + strNoText + "')])=0]", XPathFunctions.conTb);
            string sSentence = "";          // Text of the line built so far
            bool bSpacePending = false;     // Whether the next item should be preceded by a space

            for (int iLeaf = 0; iLeaf < lstLeaf.Count; iLeaf++) {
                XmlNode ndxItem = lstLeaf[iLeaf];
                XmlAttribute atType = ndxItem.Attributes["Type"];

                // "Punctuation" that contains letters is really a vernacular word
                if (atType.Value == "Punct" && General.DoLike(ndxItem.Attributes["Text"].Value, "*[a-zA-Z]*")) {
                    atType.Value = "Vern";
                }

                string sType = atType.Value;
                if (sType == "Vern") {
                    // A word: optional space, then the (possibly converted) text
                    if (bSpacePending) {
                        sSentence += " ";
                    }
                    iStart = sSentence.Length;
                    string sWord = ndxItem.Attributes["Text"].Value;
                    if (bOldEnglish) {
                        sSentence += VernToEnglish(sWord);
                    } else {
                        sSentence += sWord;
                    }
                    iEnd = sSentence.Length;
                    // Normally each word is followed by a space
                    bSpacePending = true;
                } else if (sType == "Punct") {
                    string sPunct = ndxItem.Attributes["Text"].Value;

                    // Should this punctuation be PRECEDED by a space?
                    if (bSpacePending) {
                        bool bSpaceBefore;
                        switch (sPunct) {
                            case ":": case ",": case ".": case "!": case "?": case ";": case ">>": case "»":
                                // Closing punctuation attaches to what precedes it
                                bSpaceBefore = false;
                                break;
                            case "'": case "\"":
                                // A quote only gets a space when it does not directly follow a word
                                bSpaceBefore = (iLeaf > 0) && (lstLeaf[iLeaf - 1].Attributes["Type"].Value != "Vern");
                                break;
                            default:
                                // Opening quotes («, <<) and everything else
                                bSpaceBefore = true;
                                break;
                        }
                        if (bSpaceBefore) {
                            sSentence += " ";
                        }
                    }

                    iStart = sSentence.Length;
                    sSentence += sPunct;
                    iEnd = sSentence.Length;

                    // Should this punctuation be FOLLOWED by a space?
                    switch (sPunct) {
                        case ":": case ",": case ".": case "!": case "?": case ";": case ">>": case "»":
                            bSpacePending = true;
                            break;
                        case "'": case "\"":
                            // Only a quote that closes a word asks for a space; otherwise leave as is
                            if (iLeaf > 0 && lstLeaf[iLeaf - 1].Attributes["Type"].Value == "Vern") {
                                bSpacePending = true;
                            }
                            break;
                        default:
                            // Opening quote « and everything else: no space
                            bSpacePending = false;
                            break;
                    }
                } else if (sType == "Star") {
                    // A star item occupies exactly one space
                    iStart = sSentence.Length;
                    sSentence += " ";
                    bSpacePending = false;
                    iEnd = sSentence.Length;
                } else if (sType == "Zero") {
                    // A zero item has no width
                    iStart = sSentence.Length;
                    iEnd = iStart;
                }

                // Make sure both position attributes exist
                if (ndxItem.Attributes["from"] == null) {
                    oXmlTools.AddAttribute(ndxItem, "from", "0");
                }
                if (ndxItem.Attributes["to"] == null) {
                    oXmlTools.AddAttribute(ndxItem, "to", "0");
                }

                // Positions are stored 1-based
                iStart += 1;
                string sFrom = iStart.ToString();
                XmlAttribute atFrom = ndxItem.Attributes["from"];
                if (atFrom.Value != sFrom) {
                    atFrom.Value = sFrom;
                    bDirty = true;
                }
                string sTo = iEnd.ToString();
                XmlAttribute atTo = ndxItem.Attributes["to"];
                if (atTo.Value != sTo) {
                    atTo.Value = sTo;
                    bDirty = true;
                }
            }

            // Store the rebuilt sentence in the vernacular segment
            ndxSeg.InnerText = sSentence;
        }

        // Part 2: every <eTree> spans from its first to its last descendant leaf
        string[,] arEdge = {
            { "from", "./descendant::eLeaf[1]" },
            { "to", "./descendant::eLeaf[last()]" }
        };
        XmlNodeList lstTree = ndxForest.SelectNodes("./descendant::eTree");
        for (int iTree = 0; iTree < lstTree.Count; iTree++) {
            XmlNode ndxTree = lstTree[iTree];
            for (int iEdge = 0; iEdge < 2; iEdge++) {
                string sAttr = arEdge[iEdge, 0];
                XmlNode ndxEdge = ndxTree.SelectSingleNode(arEdge[iEdge, 1]);
                if (ndxEdge == null) {
                    continue;
                }
                // Double check that the leaf has the attribute
                if (ndxEdge.Attributes[sAttr] == null) {
                    oXmlTools.AddXmlAttribute(pdxCurrentFile, ref ndxEdge, sAttr, "0");
                }
                // Take over the (integer) value of the leaf
                string sPos = Convert.ToInt32(ndxEdge.Attributes[sAttr].Value).ToString();
                XmlAttribute atTree = ndxTree.Attributes[sAttr];
                if (atTree == null) {
                    // Adding a missing attribute does not count as a change
                    oXmlTools.AddAttribute(ndxTree, sAttr, sPos);
                } else if (atTree.Value != sPos) {
                    atTree.Value = sPos;
                    bDirty = true;
                }
            }
        }

        // NOTE(review): the original text "ndxNew = ndxThis" appears to be commented out
        // ("NO!! then we change it..."), so [ndxNew] is deliberately left untouched -- confirm

        // Give message to user
        if (bDirty && bVerbose) {
            errHandle.Status("Word positions in line " + ndxForest.Attributes["forestId"].Value);
        }

        // Report whether anything was altered
        return (bDirty);
    } catch (Exception ex) {
        // Show error
        errHandle.DoError("modEditor/eTreeSentence", ex);
        // Return failure
        return (false);
    }
}