示例#1
0
        // ------------------------------------------------------------------------------------------------------------
        // Name:   getXmlStats
        // Goal:   Get all the text from the <s><t> nodes and compute a hash code from it
        //         Also compute the number of words and sentences
        // History:
        // 04-02-2016  ERK Created
        // ------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Stream through an XML (FoLiA) file and collect statistics from it:
        /// the inner XML of every &lt;t&gt; element whose @class is "nld" or "nl" is concatenated
        /// (one line per element) and hashed into <paramref name="sSimHash"/>; &lt;w&gt; elements
        /// with @class "Vern" are counted in <paramref name="iWords"/> and &lt;s&gt; elements in
        /// <paramref name="iSents"/>.
        /// </summary>
        /// <param name="sFileIn">Path of the XML input file. As a debugging aid the collected text is
        /// also written to a file named sFileIn + ".txt" (UTF-8).</param>
        /// <param name="sSimHash">Receives the hash of the collected text as a string.</param>
        /// <param name="lStat">Receives one entry per status-info hit: the content of an accepted &lt;t&gt;
        /// that matches one of the keyword patterns, followed by " // " and the content of the next
        /// accepted &lt;t&gt;. A match that is not followed by another accepted &lt;t&gt; is not added.</param>
        /// <param name="iWords">Reset to 0, then receives the number of counted &lt;w&gt; elements.</param>
        /// <param name="iSents">Reset to 0, then receives the number of &lt;s&gt; elements.</param>
        /// <returns>True on success; false if an exception occurred (it is passed to errHandle.DoError).</returns>
        public bool getXmlStats(String sFileIn, ref String sSimHash, List <String> lStat,
                                ref int iWords, ref int iSents)
        {
            // Hash method selector; with this hard-coded value only the "simhash" branch is taken
            String sMethod = "simhash";

            try {
                // Initialise
                iWords = 0; iSents = 0;
                // True when the previous accepted <t> matched a status-info keyword
                bool bUseNext = false;
                // False directly after ReadInnerXml(): that call has already moved the reader
                // to the node following </t>, so that node must be inspected before reading on
                bool bAdvance = true;
                // Collects the text of all accepted <t> elements
                StringBuilder sbThis = new StringBuilder();
                // Collects one potential StatusInfo entry at a time
                StringBuilder sbStat = new StringBuilder();
                // Create an XmlReader to get to the <s>, <w> and <t> nodes...
                using (StreamReader rdFileTmp = new StreamReader(sFileIn))
                    using (XmlReader rdFolia = XmlReader.Create(rdFileTmp)) {
                        // (1) Walk through the bare folia input file
                        while (!rdFolia.EOF)
                        {
                            // Only read on if the current node has been dealt with already
                            if (bAdvance && !rdFolia.Read())
                            {
                                break;
                            }
                            bAdvance = true;
                            // (2) Check the input element
                            if (rdFolia.IsStartElement("t"))
                            {
                                // It needs to have an attribute [class]
                                if (rdFolia.HasAttributes)
                                {
                                    String sClass = rdFolia.GetAttribute("class");
                                    if (sClass == "nld" || sClass == "nl")
                                    {
                                        // Correct attribute: read the node's content.
                                        // This leaves the reader on the node AFTER the closing tag.
                                        String sContent = rdFolia.ReadInnerXml();
                                        bAdvance = false;
                                        String sLine = sContent + "\n";
                                        sbThis.Append(sLine);
                                        // Check for StatusInfo
                                        if (bUseNext)
                                        {
                                            // This is the line following a keyword hit: complete the entry
                                            sbStat.Append(" // " + sContent);
                                            lStat.Add(sbStat.ToString());
                                            sbStat.Clear();
                                            bUseNext = false;
                                        }
                                        else if (General.DoLike(sLine.ToLowerInvariant(),
                                                                "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*"))
                                        {
                                            // Keyword hit: separate it from any text still pending in the buffer
                                            if (sbStat.Length > 0)
                                            {
                                                sbStat.Append("\n");
                                            }
                                            sbStat.Append(sContent);
                                            // The next accepted <t> is stored together with this one
                                            bUseNext = true;
                                        }
                                    }
                                }
                            }
                            else if (rdFolia.IsStartElement("w"))
                            {
                                // Only words of class "Vern" are counted
                                String sClass = rdFolia.GetAttribute("class");
                                if (sClass == "Vern")
                                {
                                    iWords += 1;
                                }
                            }
                            else if (rdFolia.IsStartElement("s"))
                            {
                                // Keep track of the number of sentences
                                iSents += 1;
                            }
                        }
                    }
                // Create one string from the whole
                String sTotal = sbThis.ToString();

                // =============== DEBUG ===============
                // Store the string into a text file
                File.WriteAllText(sFileIn + ".txt", sTotal, System.Text.Encoding.UTF8);
                // =====================================

                switch (sMethod)
                {
                case "md5":
                    // Method #1: compute the MD5 hash from the UTF-8 bytes of this string
                    using (MD5 md5 = MD5.Create())
                    {
                        byte[] hashBytes = md5.ComputeHash(Encoding.UTF8.GetBytes(sTotal));
                        // Convert the byte array to a hash string
                        sSimHash = ByteArrayToString(hashBytes);
                    }
                    break;

                case "simhash":
                    // Method #2: compute the simhash from this string
                    SimHashAnalyzer oAna = new SimHashAnalyzer();
                    UInt64 iSimHash = oAna.DoCalculateSimHash(sTotal);
                    // Convert integer to string
                    sSimHash = Convert.ToString(iSimHash);
                    break;
                }
                return(true);
            } catch (Exception ex) {
                // Warn the user
                errHandle.DoError("xmlTools/getXmlStats", ex);
                return(false);
            }
        }
示例#2
0
        // ------------------------------------------------------------------------------------
        // Name:   eTreeSentence
        // Goal:   Re-analyze a whole sentence in the following way:
        //         1. Based on the content of the [eLeaf] nodes:
        //            a. Determine the <seg> text
        //            b. Determine @from and @to for the [eLeaf] nodes
        //         2. Determine @from and @to for all the [eTree] nodes again
        // History:
        // 03-01-2013  ERK Created
        // ------------------------------------------------------------------------------------
        /// <summary>
        /// Re-analyze the sentence (&lt;forest&gt;) that contains <paramref name="ndxThis"/>:
        /// optionally rebuild the text of its div[@lang='org']/seg from the eLeaf nodes while
        /// assigning @from/@to to every eLeaf, then set @from/@to of every eTree to the @from of
        /// its first and the @to of its last descendant eLeaf.
        /// </summary>
        /// <param name="ndxThis">A node inside (or equal to) the forest to process. It is not reassigned here.</param>
        /// <param name="ndxNew">Not used by this method.</param>
        /// <param name="bVerbose">When true, a status message is issued if any position was changed.</param>
        /// <param name="bOldEnglish">When true, the @Text of "Vern" leaves is passed through VernToEnglish()
        /// before it is added to the line.</param>
        /// <param name="bDoOrg">When true, the seg text and the eLeaf positions are rebuilt; when false,
        /// only the eTree positions are refreshed.</param>
        /// <returns>True if any @from/@to attribute was added or received a new value. False if nothing
        /// of that kind changed, if no forest (or, with bDoOrg, no 'org' seg) was found, or if an
        /// exception occurred (it is passed to errHandle.DoError).</returns>
        public bool eTreeSentence(ref XmlNode ndxThis, ref XmlNode ndxNew, bool bVerbose = false,
                                  bool bOldEnglish = false, bool bDoOrg = true)
        {
            XmlNode     ndxFor     = null;  // My parent forest node
            XmlNodeList ndxList    = null;  // List of nodes being processed
            XmlNode     ndxVern    = null;  // Vernacular text line
            int         intI       = 0;     // Counter
            int         intFrom    = 0;     // Word starting point
            int         intTo      = 0;     // End of word
            bool        bNeedSpace = false; // Whether a space may be needed before the next item
            bool        bChanged   = false; // Whether anything has in fact changed
            string      strLine    = "";    // Text of this line

            try {
                // Validate something is selected
                if (ndxThis == null)
                {
                    return(false);
                }
                // Determine the parent forest node
                ndxFor = ndxThis.SelectSingleNode("./ancestor-or-self::forest[1]");
                if (ndxFor == null)
                {
                    return(false);
                }
                // Need to recalculate the "org" text?
                if (bDoOrg)
                {
                    // Get the vernacular text line
                    ndxVern = ndxFor.SelectSingleNode("./child::div[@lang='org']/seg");
                    if (ndxVern == null)
                    {
                        return(false);
                    }
                    // Get all the [eLeaf] descendants that have no [eTree] ancestor whose @Label
                    // matches strNoText (per the original comment: no CODE nor METADATA ancestor)
                    ndxList = ndxFor.SelectNodes(".//descendant::eLeaf[count(ancestor::eTree[tb:matches(@Label, '" + strNoText + "')])=0]", XPathFunctions.conTb);
                    // Walk all the leaves in document order
                    for (intI = 0; intI < ndxList.Count; intI++)
                    {
                        XmlNode      ndxItem = ndxList[intI];
                        XmlAttribute attType = ndxItem.Attributes["Type"];
                        string       strText;

                        // A "Punct" leaf whose text contains a letter must be of type "Vern" instead
                        if ((attType.Value == "Punct") &&
                            (General.DoLike(ndxItem.Attributes["Text"].Value, "*[a-zA-Z]*")))
                        {
                            attType.Value = "Vern";
                        }
                        switch (attType.Value)
                        {
                        case "Vern":
                            // Need to add a space?
                            if (bNeedSpace)
                            {
                                strLine += " ";
                            }
                            // Zero-based offset where this word starts
                            intFrom = strLine.Length;
                            // Add word to the text of this line
                            strText = ndxItem.Attributes["Text"].Value;
                            if (bOldEnglish)
                            {
                                strLine += VernToEnglish(strText);
                            }
                            else
                            {
                                strLine += strText;
                            }
                            // Length of the line including this word
                            intTo = strLine.Length;
                            // Normally each word should be followed by a space
                            bNeedSpace = true;
                            break;

                        case "Punct":
                            strText = ndxItem.Attributes["Text"].Value;
                            // Are we supposed to add a space?
                            if (bNeedSpace)
                            {
                                // Check if this punctuation should be PRECEDED by a space
                                switch (strText)
                                {
                                case ":":
                                case ",":
                                case ".":
                                case "!":
                                case "?":
                                case ";":
                                case ">>":
                                case "»":
                                    // A space may NOT precede this punctuation
                                    break;

                                case "«":
                                case "<<":
                                    // A space must precede this punctuation
                                    strLine += " ";
                                    break;

                                case "'":
                                case "\"":
                                    // Only add a space if this is not the first leaf and the
                                    // preceding leaf is not a word
                                    if ((intI > 0) && (ndxList[intI - 1].Attributes["Type"].Value != "Vern"))
                                    {
                                        strLine += " ";
                                    }
                                    break;

                                default:
                                    // In all other cases a space has to be added
                                    strLine += " ";
                                    break;
                                }
                            }
                            // Zero-based offset where this punctuation starts
                            intFrom = strLine.Length;
                            // Add the punctuation to the text of this line
                            strLine += strText;
                            // Length of the line including this punctuation
                            intTo = strLine.Length;
                            // Check if this punctuation should be FOLLOWED by a space
                            switch (strText)
                            {
                            case ":":
                            case ",":
                            case ".":
                            case "!":
                            case "?":
                            case ";":
                            case ">>":
                            case "»":
                                // A space must follow
                                bNeedSpace = true;
                                break;

                            case "«":
                                // A space should not follow
                                bNeedSpace = false;
                                break;

                            case "'":
                            case "\"":
                                // After a word: a space must follow. Otherwise the current
                                // spacing state is deliberately left untouched.
                                if ((intI > 0) && (ndxList[intI - 1].Attributes["Type"].Value == "Vern"))
                                {
                                    bNeedSpace = true;
                                }
                                break;

                            default:
                                // Reset spacing
                                bNeedSpace = false;
                                break;
                            }
                            break;

                        case "Star":
                            // A star item occupies exactly one space in the line
                            intFrom    = strLine.Length;
                            strLine   += " ";
                            bNeedSpace = false;
                            intTo      = strLine.Length;
                            break;

                        case "Zero":
                            // A zero item occupies no room in the line
                            intFrom = strLine.Length;
                            intTo   = intFrom;
                            break;
                        }
                        // Make sure @from and @to exist; adding one is a change to the document
                        if (ndxItem.Attributes["from"] == null)
                        {
                            oXmlTools.AddAttribute(ndxItem, "from", "0");
                            bChanged = true;
                        }
                        if (ndxItem.Attributes["to"] == null)
                        {
                            oXmlTools.AddAttribute(ndxItem, "to", "0");
                            bChanged = true;
                        }
                        // @from is stored one-based, so shift the zero-based start offset
                        intFrom += 1;
                        XmlAttribute attFrom = ndxItem.Attributes["from"];
                        string       strFrom = intFrom.ToString();
                        if (attFrom.Value != strFrom)
                        {
                            attFrom.Value = strFrom;
                            bChanged      = true;
                        }
                        XmlAttribute attTo = ndxItem.Attributes["to"];
                        string       strTo = intTo.ToString();
                        if (attTo.Value != strTo)
                        {
                            attTo.Value = strTo;
                            bChanged    = true;
                        }
                    }
                    // Adapt the sentence in the vernacular
                    ndxVern.InnerText = strLine;
                }
                // Get all the <eTree> nodes
                ndxList = ndxFor.SelectNodes("./descendant::eTree");
                // Give each of them the @from of its first and the @to of its last leaf
                for (intI = 0; intI < ndxList.Count; intI++)
                {
                    XmlNode ndxTree = ndxList[intI];
                    if (eTreeBoundary(ndxTree, "./descendant::eLeaf[1]", "from"))
                    {
                        bChanged = true;
                    }
                    if (eTreeBoundary(ndxTree, "./descendant::eLeaf[last()]", "to"))
                    {
                        bChanged = true;
                    }
                }
                // Note: [ndxNew] is deliberately left alone
                // Give message to user
                if ((bChanged) && (bVerbose))
                {
                    // A forest without @forestId must not turn a successful run into a failure
                    XmlAttribute attId = ndxFor.Attributes["forestId"];
                    errHandle.Status("Word positions in line " + (attId == null ? "" : attId.Value));
                }
                // Return whether anything changed
                return(bChanged);
            } catch (Exception ex) {
                // Show error
                errHandle.DoError("modEditor/eTreeSentence", ex);
                // Return failure
                return(false);
            }
        }

        // ------------------------------------------------------------------------------------
        // Name:   eTreeBoundary
        // Goal:   Copy the integer value of attribute [sAttr] from the <eLeaf> selected by
        //         [sLeafPath] (relative to [ndxTree]) to the same attribute of [ndxTree].
        //         A missing attribute on the leaf is first created with value "0".
        // Return: true if an attribute was added or received a different value;
        //         false otherwise, also when no leaf is selected by [sLeafPath]
        // ------------------------------------------------------------------------------------
        private bool eTreeBoundary(XmlNode ndxTree, string sLeafPath, string sAttr)
        {
            bool    bChanged = false;
            XmlNode ndxLeaf  = ndxTree.SelectSingleNode(sLeafPath);

            if (ndxLeaf == null)
            {
                return(false);
            }
            // Double check that the leaf has the attribute
            if (ndxLeaf.Attributes[sAttr] == null)
            {
                oXmlTools.AddXmlAttribute(pdxCurrentFile, ref ndxLeaf, sAttr, "0");
                bChanged = true;
            }
            // Round-trip through an integer, so the stored text is the plain integer form
            string sValue = Convert.ToInt32(ndxLeaf.Attributes[sAttr].Value).ToString();
            XmlAttribute attTree = ndxTree.Attributes[sAttr];
            if (attTree == null)
            {
                oXmlTools.AddAttribute(ndxTree, sAttr, sValue);
                bChanged = true;
            }
            else if (attTree.Value != sValue)
            {
                attTree.Value = sValue;
                bChanged      = true;
            }
            return(bChanged);
        }