// ------------------------------------------------------------------------------------------------------------
// Name:    getXmlStats
// Goal:    Get all the text from the <t> nodes whose @class is "nld" or "nl" and compute a hash code from it
//          Also compute the number of words (<w class="Vern">) and sentences (<s>)
// Params:  sFileIn   - path of the XML input file; a debug copy of the text is written to sFileIn + ".txt"
//          sSimHash  - receives the hash of the collected text as a string
//          lStat     - receives the status-info evidence: each <t> line matching one of the keywords,
//                      combined with the <t> line following it (when there is one)
//          iWords    - receives the number of <w> start elements with class "Vern"
//          iSents    - receives the number of <s> start elements
// Returns: true on success, false when an exception occurred (reported through errHandle)
// History:
// 04-02-2016  ERK Created
// ------------------------------------------------------------------------------------------------------------
public bool getXmlStats(String sFileIn, ref String sSimHash, List<String> lStat, ref int iWords, ref int iSents) {
  String sMethod = "simhash";   // Hash method that is used: "simhash" or "md5"

  try {
    // Initialise
    iWords = 0; iSents = 0;
    bool bUseNext = false;      // True when the next <t> line must be appended to pending evidence
    // Prepare a string builder to collect all the text we need
    StringBuilder sbThis = new StringBuilder();
    // Also prepare a string builder to store potential StatusInfo evidence
    StringBuilder sbStat = new StringBuilder();

    // Create an XmlReader to get to the <t> nodes...
    using (StreamReader rdFileTmp = new StreamReader(sFileIn))
    using (XmlReader rdFolia = XmlReader.Create(rdFileTmp)) {
      // (1) Walk through the input file
      bool bMore = rdFolia.Read();
      while (bMore) {
        // ReadInnerXml() leaves the reader on the node *after* the end tag.
        // In that case we must not call Read() again, or that node would be skipped.
        bool bAdvanced = false;

        // (2) Check the input element
        if (rdFolia.IsStartElement("t")) {
          // It needs to have an attribute [class]
          if (rdFolia.HasAttributes) {
            // Get the @class attribute
            String sClass = rdFolia.GetAttribute("class");
            // Check the value
            if (sClass == "nld" || sClass == "nl") {
              // Correct attribute: read the node (this moves the reader past </t>)
              String sContent = rdFolia.ReadInnerXml();
              bAdvanced = true;
              String sLine = sContent + "\n";
              sbThis.Append(sLine);
              // Check for StatusInfo
              if (bUseNext) {
                // This is the line following a keyword line: complete the evidence
                sbStat.Append(" // " + sContent);
                // Add to the list of statusinfo evidence
                lStat.Add(sbStat.ToString());
                sbStat.Clear();
                bUseNext = false;
              } else if (General.DoLike(sLine.ToLower(), "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*")) {
                // Keyword line: keep it and wait for the next line.
                // No evidence is pending here (bUseNext is false), so sbStat is empty.
                sbStat.Append(sContent);
                bUseNext = true;
              }
            }
          }
        } else if (rdFolia.IsStartElement("w")) {
          // Get the @class attribute
          String sClass = rdFolia.GetAttribute("class");
          if (sClass == "Vern") iWords += 1;
        } else if (rdFolia.IsStartElement("s")) {
          // Keep track of the number of sentences
          iSents += 1;
        }

        // Move on: only call Read() when the reader has not been advanced already
        bMore = bAdvanced ? !rdFolia.EOF : rdFolia.Read();
      }
    }

    // A keyword line that was the very last <t> line has no follow-up line: do not lose it
    if (bUseNext && sbStat.Length > 0) {
      lStat.Add(sbStat.ToString());
      sbStat.Clear();
    }

    // Create one string from the whole
    String sTotal = sbThis.ToString();

    // =============== DEBUG ===============
    // Store the string into a text file
    File.WriteAllText(sFileIn + ".txt", sTotal, System.Text.Encoding.UTF8);
    // =====================================

    switch (sMethod) {
      case "md5":
        // Method #1: compute the MD5 hash from this string
        using (var md5 = MD5.Create())
        using (MemoryStream mStrm = new MemoryStream(Encoding.UTF8.GetBytes(sTotal))) {
          byte[] hashBytes = md5.ComputeHash(mStrm);
          // Convert the byte array to a hash string
          sSimHash = ByteArrayToString(hashBytes);
        }
        break;
      case "simhash":
        // Method #2: compute the simhash from this string
        SimHashAnalyzer oAna = new SimHashAnalyzer();
        UInt64 iSimHash = oAna.DoCalculateSimHash(sTotal);
        // Convert integer to string
        sSimHash = Convert.ToString(iSimHash);
        break;
    }

    return true;
  } catch (Exception ex) {
    // Warn the user
    errHandle.DoError("xmlTools/getXmlStats", ex);
    return false;
  }
}
/* -------------------------------------------------------------------------------------
 * Name:    findDuplicates
 * Goal:    Go through the list and find any duplicates
 *          Add this duplicate information to the .cmdi file
 *          Then copy the adapted .cmdi.xml and the .folia.xml.gz to a target directory
 *          below [sDirRoot] (year / imdb id)
 * Params:  lSubInst - the subtitle instances to be compared and processed
 *          iGoodHd  - threshold: two instances of the same movie are regarded as duplicates
 *                     when the hamming distance of their simhashes is on or below this value
 *          objOmdb  - object used to retrieve movie information for an imdb id
 * Returns: true on success; false when a helper fails or an exception occurs
 * Note:    Instances whose .cmdi.xml file does not exist are skipped with a status message
 * History:
 * 10/feb/2016 ERK Created
   ------------------------------------------------------------------------------------- */
public bool findDuplicates(ref List<SubInstance> lSubInst, int iGoodHd, ref omdbapi objOmdb) {
  try {
    // Need to have a hash analyzer object
    util.SimHashAnalyzer oSim = new util.SimHashAnalyzer();

    // Walk through the list of instances
    for (int i = 0; i < lSubInst.Count; i++) {
      // Walk through all other instances that could match up with me
      for (int j = 0; j < i; j++) {
        // Check if the idmovie or the imdbid is the same for <i,j>
        if (lSubInst[i].sIdMovie == lSubInst[j].sIdMovie || lSubInst[i].sImdbId == lSubInst[j].sImdbId) {
          // Get the hamming distance between items <i,j>
          int iHdist = oSim.GetHammingDistance(lSubInst[i].simhash, lSubInst[j].simhash);
          // Is this on or below the threshold?
          if (iHdist <= iGoodHd) {
            // So this is probably a duplicate of me -- add it
            lSubInst[i].addDuplicate(j);
            // DO add me to the list of the other one (otherwise order of occurrence plays a role)
            lSubInst[j].addDuplicate(i);
          }
        }
      }
    }

    // Walk through all the instances again
    for (int i = 0; i < lSubInst.Count; i++) {
      String sTargetDir = sDirRoot;   // Directory where we will store the result
      // Get this instance
      SubInstance oOrg = lSubInst[i];
      // Initialisations
      String sLink = "";
      List<String> lSimilar = new List<string>();
      List<String> lEvid = new List<string>();

      // Does this one have duplicates?
      if (oOrg.lDup.Count > 0) {
        // Find the first longest text both in words and sentences
        int iWords = oOrg.words;
        int iSents = oOrg.sents;
        int iLongest = -1;      // Position *within oOrg.lDup* of the longest duplicate
        bool bEqual = true;     // Assume all copies are equal until we find a difference
        for (int j = 0; j < oOrg.lDup.Count; j++) {
          SubInstance oThis = lSubInst[oOrg.lDup[j]];
          if (oThis.words > iWords && oThis.sents >= iSents) {
            // Adapt the new maximum
            iWords = oThis.words; iSents = oThis.sents; iLongest = j;
          }
          // Check for inequality
          if (oThis.words != iWords || oThis.sents != iSents) bEqual = false;
        }
        // Check what the longest is; that's the most 'original'
        if (iLongest < 0) {
          // No duplicate is longer than [oOrg]. Are they all equal?
          if (bEqual) {
            // All duplicates have the same size as [oOrg]
            oOrg.license = "equal";
            sLink = "list";
          } else {
            // The [oOrg] is the longest
            oOrg.license = "largest";
            sLink = "this";
          }
          // Create a list of similar ones
          for (int j = 0; j < oOrg.lDup.Count; j++) {
            lSimilar.Add(lSubInst[oOrg.lDup[j]].name);
          }
        } else {
          // Another one is the longest
          oOrg.license = "copy";
          // [iLongest] is a position in the duplicate list, so translate it to an instance index
          sLink = lSubInst[oOrg.lDup[iLongest]].name;
        }
      } else {
        // This is a unique subtitle file
        oOrg.license = "unique";
        sLink = "none";
      }

      // Adapt the .cmdi.xml file for this item
      String sFileCmdi = oOrg.file.Replace(".folia.xml", ".cmdi.xml");

      // Do we need to continue with this instance?
      if (!File.Exists(sFileCmdi)) {
        // If the CMDI file does not exist, we cannot adapt it
        // It is not really an error, but we should give a warning
        errHandle.Status("findDuplicates: skipping non-existent [" + sFileCmdi + "]");
        // Skip this instance only: the remaining instances still need to be processed
        continue;
      }

      // Read the CMDI
      XmlDocument pdxCmdi = new XmlDocument();
      pdxCmdi.Load(sFileCmdi);
      oTools.SetXmlDocument(pdxCmdi, CLARIN_CMDI);
      // Get correct namespace manager
      XmlNamespaceManager nsFolia = new XmlNamespaceManager(pdxCmdi.NameTable);
      nsFolia.AddNamespace("f", pdxCmdi.DocumentElement.NamespaceURI);

      // Find the <Subtitle> element
      XmlNode ndxSubtitle = pdxCmdi.SelectSingleNode("./descendant::f:Subtitle", nsFolia);
      if (ndxSubtitle != null) {
        // Get the movie id
        // NOTE(review): a missing <MovieId> raises an exception here, which ends the whole method via the catch
        XmlNode ndxMovieId = pdxCmdi.SelectSingleNode("./descendant::f:MovieId", nsFolia);
        String sIdMovie = ndxMovieId.InnerText;
        // Get the list of languages
        String sSubLangs = "";
        if (!getSubtitleLanguages(sIdMovie, ref sSubLangs)) return false;

        // Remove any available list
        XmlNode ndxAvailableList = ndxSubtitle.SelectSingleNode("./child::f:AvailableList", nsFolia);
        if (ndxAvailableList != null) {
          ndxAvailableList.RemoveAll();
          ndxSubtitle.RemoveChild(ndxAvailableList);
          // When there was an <AvailableList>, the existing <languageAvailable> is removed as well
          XmlNode ndxTmp = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
          if (ndxTmp != null) {
            ndxTmp.RemoveAll();
            ndxSubtitle.RemoveChild(ndxTmp);
          }
        }

        // Process the languages available list:
        XmlNode ndxLngAvail = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
        if (ndxLngAvail == null) {
          ndxLngAvail = oTools.AddXmlChild(ndxSubtitle, "languageAvailable", "", sSubLangs, "text");
        } else {
          // Check if this string is already in there
          if (!ndxLngAvail.InnerText.Contains(sSubLangs)) {
            // Add the string
            ndxLngAvail.InnerText = ndxLngAvail.InnerText + " " + sSubLangs;
          }
        }

        // Check if there is a statusinfo child
        XmlNode ndxStatusInfo = ndxSubtitle.SelectSingleNode("./child::f:StatusInfo", nsFolia);
        if (ndxStatusInfo == null) {
          // Create such a child
          ndxStatusInfo = oTools.AddXmlChild(ndxSubtitle, "StatusInfo",
            "status", "", "attribute",
            "link", "", "attribute");
        } else {
          // Get any status info evidence there is
          XmlNode ndxEvid = ndxStatusInfo.SelectSingleNode("./child::f:Evidence", nsFolia);
          while (ndxEvid != null) {
            lEvid.Add(ndxEvid.InnerText);
            ndxEvid = ndxEvid.SelectSingleNode("./following-sibling::f:Evidence", nsFolia);
          }
        }
        // Add the information into the status info node
        ndxStatusInfo.Attributes["status"].Value = oOrg.license;
        ndxStatusInfo.Attributes["link"].Value = sLink;

        // Remove any previous links that might be still in here
        XmlNode ndxWork = ndxStatusInfo.SelectSingleNode("./child::f:Similar", nsFolia);
        while (ndxWork != null) {
          // Determine the next sibling *before* removing the current node
          XmlNode ndxRemove = ndxWork;
          ndxWork = ndxWork.SelectSingleNode("./following-sibling::f:Similar", nsFolia);
          // Remove worknode contents
          ndxRemove.RemoveAll();
          // Remove the worknode itself
          ndxStatusInfo.RemoveChild(ndxRemove);
        }

        // Add any links
        for (int j = 0; j < lSimilar.Count; j++) {
          XmlNode ndxSimi = oTools.AddXmlChild(ndxStatusInfo, "Similar", "SimilarId", Convert.ToString(j+1), "attribute");
          ndxSimi.InnerText = Path.GetFileNameWithoutExtension(lSimilar[j]);
        }

        // Try to find out more information, depending on the status we have found
        switch (oOrg.license) {
          case "copy":
          case "largest":
          case "equal":
          case "unique":
            bool bCopyright = false;
            bool bTranslated = false;
            bool bDownload = false;
            String sDetails = "";
            // Walk through all evidence
            for (int j = 0; j < lEvid.Count; j++) {
              // Get this evidence
              String sEvid = lEvid[j].ToLower();
              // Check for RIP information
              if (util.General.DoLike(sEvid, "*ripped*|*copyright*")) {
                // Add the copyright information as evidence if it is necessary
                if (bTranslated || bDownload || !bCopyright) sDetails = sEvid;
                bCopyright = true;
              } else if (util.General.DoLike(sEvid, "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*")) {
                // Double check the text of the evidence
                if (util.General.DoLike(sEvid, "*broadcast text*|*bti *")) {
                  // This is 'stolen' from BTI or its predecessor
                  bCopyright = true;
                  sEvid = "BTI: " + sEvid;
                } else {
                  bTranslated = true;
                }
                if (sDetails == "" || bDownload) sDetails = sEvid;
              } else if (util.General.DoLike(sEvid, "*download*")) {
                bDownload = true;
                if (sDetails == "") sDetails = sEvid;
              }
            }
            // Additional check: the user class of the author
            XmlNode ndxUserClass = ndxSubtitle.SelectSingleNode("./descendant::f:Author/child::f:UserClass", nsFolia);
            if (ndxUserClass != null) {
              String sUserClass = ndxUserClass.InnerText.ToLower();
              if (sUserClass == "subtranslator") {
                bTranslated = true;
                sDetails = "userclass=SubTranslator";
              }
            }
            // We should be able to determine the license information
            // Priority: copyright > translation > download > unknown
            String sLicense = "";
            if (bCopyright) sLicense = "copyright";
            else if (bTranslated) sLicense = "translation";
            else if (bDownload) sLicense = "download";
            else sLicense = "unknown";
            // Find the location where we are going to put this information
            XmlNode ndxLicenseType = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseType", nsFolia);
            ndxLicenseType.InnerText = sLicense;
            XmlNode ndxLicenseDetails = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseDetails", nsFolia);
            ndxLicenseDetails.InnerText = sDetails;
            // Adapt the target directory
            XmlNode ndxYear = pdxCmdi.SelectSingleNode("./descendant::f:Year", nsFolia);
            XmlNode ndxImdbId = pdxCmdi.SelectSingleNode("./descendant::f:ImdbId", nsFolia);
            String sYear = ndxYear.InnerText;
            String sImdbId = ndxImdbId.InnerText;
            // Determine the target directory: [root] / [year or "unknown"] / [imdb id]
            // NOTE(review): this assumes that sDirRoot ends with a directory separator -- confirm
            if (sYear != "") sTargetDir += sYear + "/"; else sTargetDir += "unknown/";
            if (sImdbId != "") sTargetDir += sImdbId + "/";
            break;
          default:
            // Any other status: no further license determination is done
            break;
        }
      }

      // Find the <Movie> element
      XmlNode ndxMovie = pdxCmdi.SelectSingleNode("./descendant::f:Movie", nsFolia);
      if (ndxMovie != null) {
        // Movie information needs to be gathered *ALWAYS*
        XmlNode ndxImdbId = ndxMovie.SelectSingleNode("./child::f:ImdbId", nsFolia);
        String sImdbId = ndxImdbId.InnerText;
        // Get the movie information
        MovieInfo oInfo = objOmdb.getInfo(sImdbId);
        if (oInfo == null) {
          // No information available: report it and continue without movie details
          errHandle.Status("findDuplicates: could not get information for imdb="+sImdbId);
        } else {
          // (1) Add the runtime information
          if (!addOneInfo(ndxMovie, nsFolia, "Runtime", oInfo.runtime)) return false;
          // (2) Add the COUNTRY information
          if (!addMultiInfo(ndxMovie, nsFolia, "Country", oInfo.country)) return false;
          // (3) Add the GENRE information
          if (!addMultiInfo(ndxMovie, nsFolia, "Genre", oInfo.genre)) return false;
          // (4) Add the LANGUAGE information
          if (!addMultiInfo(ndxMovie, nsFolia, "Language", oInfo.language)) return false;
          // (5) Add the DIRECTOR information
          if (!addMultiInfo(ndxMovie, nsFolia, "Director", oInfo.director)) return false;
          // (6) Add the WRITER information
          if (!addMultiInfo(ndxMovie, nsFolia, "Writer", oInfo.writer)) return false;
          // (7) Add the ACTOR information
          if (!addMultiInfo(ndxMovie, nsFolia, "Actor", oInfo.actors)) return false;
          // (8) Add other information: rated, released, plot, awards, imdbRating, imdbVotes
          if (!addOneInfo(ndxMovie, nsFolia, "Rated", oInfo.rated)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Released", oInfo.released)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Plot", oInfo.plot)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Awards", oInfo.awards)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "imdbRating", oInfo.imdbRating)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "imdbVotes", oInfo.imdbVotes.Replace(",", ""))) return false;
          // (9) Look for the <Series>...
          XmlNode ndxSeries = pdxCmdi.SelectSingleNode("./descendant::f:Series", nsFolia);
          if (ndxSeries != null) {
            // Get the nodes we are interested in
            XmlNode ndxSeason = ndxSeries.SelectSingleNode("./child::f:Season", nsFolia);
            XmlNode ndxEpisode = ndxSeries.SelectSingleNode("./child::f:Episode", nsFolia);
            XmlNode ndxParent = ndxSeries.SelectSingleNode("./child::f:ParentImdbId", nsFolia);
            MovieInfo oParent = null;
            if (ndxParent != null && ndxParent.InnerText != "") {
              String sParentImdbId = ndxParent.InnerText;
              oParent = objOmdb.getInfo(sParentImdbId);
            }
            // Add the season/episode information
            if (ndxSeason != null && ndxEpisode != null) {
              // NOTE(review): both branches currently add an empty @Name -- the parent
              //               information is retrieved but not used yet
              if (oParent == null) {
                oTools.AddAttribute(ndxSeason, "Name", "");
                oTools.AddAttribute(ndxEpisode, "Name", "");
              } else {
                oTools.AddAttribute(ndxSeason, "Name", "");
                oTools.AddAttribute(ndxEpisode, "Name", "");
              }
            }
          }
        }
      }

      // Save the adapted CMDI
      pdxCmdi.Save(sFileCmdi);

      // Create the target directory if it does not exist yet
      if (!Directory.Exists(sTargetDir)) {
        Directory.CreateDirectory(sTargetDir);
      }
      // Get the file name (without ".cmdi.xml") and the source directory
      String sName = Path.GetFileNameWithoutExtension(sFileCmdi).Replace(".cmdi", "");
      String sSrc = Path.GetDirectoryName(sFileCmdi);
      if (!sSrc.EndsWith("/") && !sSrc.EndsWith("\\")) sSrc += "/";
      // Copy the CMDI
      File.Copy(sFileCmdi, sTargetDir + sName + ".cmdi.xml", true);
      // Copy the folia
      sName = sName + ".folia.xml.gz";
      File.Copy(sSrc + sName, sTargetDir + sName, true);
      // Show where we are
      errHandle.Status("copying:\t" + oOrg.name + "\t" + oOrg.license + "\t" + sLink + "\t" + sTargetDir);
    }

    // Return positively
    return true;
  } catch (Exception ex) {
    errHandle.DoError("oprConv/findDuplicates", ex);
    return false;
  }
}
/* -------------------------------------------------------------------------------------
 * Name:    getDistanceOview
 * Goal:    Produce an overview of the distances between all pairs of files in [lstSimHash]
 *          Each line has tab-separated: likeness value, hamming distance, file i, file j,
 *          followed by an "fc <file i> <file j>" command text
 * Returns: the overview text, or an empty string when an exception occurred
 * History:
 * 8/feb/2016  ERK Created
   ------------------------------------------------------------------------------------- */
public String getDistanceOview() {
  StringBuilder sbOview = new StringBuilder();

  try {
    util.SimHashAnalyzer oAnalyzer = new util.SimHashAnalyzer();

    // Visit every pair <iRow, iCol> where iCol comes before iRow
    int iRow = 0;
    while (iRow < lstSimHash.Count) {
      int iCol = 0;
      while (iCol < iRow) {
        // Calculate both measures for this pair
        float fLikeness = oAnalyzer.GetLikenessValue(lstSimHash[iRow].iSimHash, lstSimHash[iCol].iSimHash);
        int iHamming = oAnalyzer.GetHammingDistance(lstSimHash[iRow].iSimHash, lstSimHash[iCol].iSimHash);
        // The names of the two files involved
        var fileRow = lstSimHash[iRow].sFile;
        var fileCol = lstSimHash[iCol].sFile;
        // Put it all into one line of the overview
        sbOview.AppendLine(fLikeness + "\t" + iHamming + "\t" +
          fileRow + "\t" + fileCol +
          "\tfc " + fileRow + " " + fileCol);
        iCol++;
      }
      iRow++;
    }

    // Hand back the combined lines
    return sbOview.ToString();
  } catch (Exception ex) {
    errHandle.DoError("oprConv/getDistanceOview", ex);
    return "";
  }
}
// ------------------------------------------------------------------------------------------------------------
// Name:    getXmlStats
// Goal:    Get all the text from the <t> nodes whose @class is "nld" or "nl" and compute a hash code from it
//          Also compute the number of words (<w class="Vern">) and sentences (<s>)
// Params:  sFileIn   - path of the XML input file; a debug copy of the text is written to sFileIn + ".txt"
//          sSimHash  - receives the hash of the collected text as a string
//          lStat     - receives the status-info evidence: each <t> line matching one of the keywords,
//                      combined with the <t> line following it (when there is one)
//          iWords    - receives the number of <w> start elements with class "Vern"
//          iSents    - receives the number of <s> start elements
// Returns: true on success, false when an exception occurred (reported through errHandle)
// History:
// 04-02-2016  ERK Created
// ------------------------------------------------------------------------------------------------------------
public bool getXmlStats(String sFileIn, ref String sSimHash, List<String> lStat, ref int iWords, ref int iSents) {
  String sMethod = "simhash";   // Hash method that is used: "simhash" or "md5"

  try {
    // Initialise
    iWords = 0; iSents = 0;
    bool bUseNext = false;      // True when the next <t> line must be appended to pending evidence
    // Prepare a string builder to collect all the text we need
    StringBuilder sbThis = new StringBuilder();
    // Also prepare a string builder to store potential StatusInfo evidence
    StringBuilder sbStat = new StringBuilder();

    // Create an XmlReader to get to the <t> nodes...
    using (StreamReader rdFileTmp = new StreamReader(sFileIn))
    using (XmlReader rdFolia = XmlReader.Create(rdFileTmp)) {
      // (1) Walk through the input file
      bool bMore = rdFolia.Read();
      while (bMore) {
        // ReadInnerXml() leaves the reader on the node *after* the end tag.
        // In that case we must not call Read() again, or that node would be skipped.
        bool bAdvanced = false;

        // (2) Check the input element
        if (rdFolia.IsStartElement("t")) {
          // It needs to have an attribute [class]
          if (rdFolia.HasAttributes) {
            // Get the @class attribute
            String sClass = rdFolia.GetAttribute("class");
            // Check the value
            if (sClass == "nld" || sClass == "nl") {
              // Correct attribute: read the node (this moves the reader past </t>)
              String sContent = rdFolia.ReadInnerXml();
              bAdvanced = true;
              String sLine = sContent + "\n";
              sbThis.Append(sLine);
              // Check for StatusInfo
              if (bUseNext) {
                // This is the line following a keyword line: complete the evidence
                sbStat.Append(" // " + sContent);
                // Add to the list of statusinfo evidence
                lStat.Add(sbStat.ToString());
                sbStat.Clear();
                bUseNext = false;
              } else if (General.DoLike(sLine.ToLower(), "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*")) {
                // Keyword line: keep it and wait for the next line.
                // No evidence is pending here (bUseNext is false), so sbStat is empty.
                sbStat.Append(sContent);
                bUseNext = true;
              }
            }
          }
        } else if (rdFolia.IsStartElement("w")) {
          // Get the @class attribute
          String sClass = rdFolia.GetAttribute("class");
          if (sClass == "Vern") iWords += 1;
        } else if (rdFolia.IsStartElement("s")) {
          // Keep track of the number of sentences
          iSents += 1;
        }

        // Move on: only call Read() when the reader has not been advanced already
        bMore = bAdvanced ? !rdFolia.EOF : rdFolia.Read();
      }
    }

    // A keyword line that was the very last <t> line has no follow-up line: do not lose it
    if (bUseNext && sbStat.Length > 0) {
      lStat.Add(sbStat.ToString());
      sbStat.Clear();
    }

    // Create one string from the whole
    String sTotal = sbThis.ToString();

    // =============== DEBUG ===============
    // Store the string into a text file
    File.WriteAllText(sFileIn + ".txt", sTotal, System.Text.Encoding.UTF8);
    // =====================================

    switch (sMethod) {
      case "md5":
        // Method #1: compute the MD5 hash from this string
        using (var md5 = MD5.Create())
        using (MemoryStream mStrm = new MemoryStream(Encoding.UTF8.GetBytes(sTotal))) {
          byte[] hashBytes = md5.ComputeHash(mStrm);
          // Convert the byte array to a hash string
          sSimHash = ByteArrayToString(hashBytes);
        }
        break;
      case "simhash":
        // Method #2: compute the simhash from this string
        SimHashAnalyzer oAna = new SimHashAnalyzer();
        UInt64 iSimHash = oAna.DoCalculateSimHash(sTotal);
        // Convert integer to string
        sSimHash = Convert.ToString(iSimHash);
        break;
    }

    return true;
  } catch (Exception ex) {
    // Warn the user
    errHandle.DoError("xmlTools/getXmlStats", ex);
    return false;
  }
}