/* -------------------------------------------------------------------------------------
 * Name:  findDuplicates
 * Goal:  Go through the list and find any duplicates
 *        Add this duplicate information to the .cmdi file
 * Args:  lSubInst - the subtitle instances to compare; each instance gets its
 *                   duplicates registered (addDuplicate) and its [license] set
 *        iGoodHd  - largest Hamming distance (inclusive) between two simhashes
 *                   that still counts as "duplicate"
 *        objOmdb  - used (through getInfo) to fetch movie information per imdb id
 * Return: true when all instances were processed;
 *         false when a helper (getSubtitleLanguages, addOneInfo, addMultiInfo)
 *         reports failure or when an exception is caught (reported via errHandle)
 * Notes: - An instance whose .cmdi.xml file does not exist is skipped; processing
 *          continues with the next instance
 *        - For every processed instance the adapted .cmdi.xml is saved and then
 *          copied, together with its .folia.xml.gz, to <root>/<year>/<imdbid>/
 * History:
 * 10/feb/2016 ERK Created
   ------------------------------------------------------------------------------------- */
public bool findDuplicates(ref List<SubInstance> lSubInst, int iGoodHd, ref omdbapi objOmdb) {
  try {
    // Need to have a hash analyzer object
    util.SimHashAnalyzer oSim = new util.SimHashAnalyzer();

    // Walk through the list of instances
    for (int i = 0; i < lSubInst.Count; i++) {
      // Walk through all other instances that could match up with me
      for (int j = 0; j < i; j++) {
        // Check if the idmovie or the imdbid is the same between <i,j>
        if (lSubInst[i].sIdMovie == lSubInst[j].sIdMovie || lSubInst[i].sImdbId == lSubInst[j].sImdbId) {
          // Get the hamming distance between items <i,j>
          int iHdist = oSim.GetHammingDistance(lSubInst[i].simhash, lSubInst[j].simhash);
          // Is this on or below the threshold?
          if (iHdist <= iGoodHd) {
            // So this is probably a duplicate of me -- add it
            lSubInst[i].addDuplicate(j);
            // Register the relation in both directions
            // (otherwise the order of occurrence would play a role)
            lSubInst[j].addDuplicate(i);
          }
        }
      }
    }

    // Walk through all the instances again
    for (int i = 0; i < lSubInst.Count; i++) {
      String sTargetDir = sDirRoot;   // Directory where we will store the result
      // Get this instance
      SubInstance oOrg = lSubInst[i];
      // Initialisations
      String sLink = "";
      List<String> lSimilar = new List<string>();
      List<String> lEvid = new List<string>();

      // Does this one have duplicates?
      if (oOrg.lDup.Count > 0) {
        // Find the first longest text both in words and sentences
        int iWords = oOrg.words;
        int iSents = oOrg.sents;
        int iLongest = -1;    // Position *within oOrg.lDup* of the longest duplicate; -1 = [oOrg] itself
        bool bEqual = true;   // Assume all copies are equal in size until one differs
        for (int j = 0; j < oOrg.lDup.Count; j++) {
          SubInstance oThis = lSubInst[oOrg.lDup[j]];
          if (oThis.words > iWords && oThis.sents >= iSents) {
            // Adapt the new maximum
            iWords = oThis.words;
            iSents = oThis.sents;
            iLongest = j;
          }
          // Check for inequality (against the maximum found so far)
          if (oThis.words != iWords || oThis.sents != iSents) bEqual = false;
        }
        // Check what the longest is; that's the most 'original'
        if (iLongest < 0) {
          // The [oOrg] is the longest -- are the others equal to it?
          if (bEqual) {
            oOrg.license = "equal";
            sLink = "list";
          } else {
            oOrg.license = "largest";
            sLink = "this";
          }
          // Create a list of similar ones
          for (int j = 0; j < oOrg.lDup.Count; j++) {
            lSimilar.Add(lSubInst[oOrg.lDup[j]].name);
          }
        } else {
          // Another one is the longest
          oOrg.license = "copy";
          // [iLongest] indexes oOrg.lDup, so translate it into an index of lSubInst first
          sLink = lSubInst[oOrg.lDup[iLongest]].name;
        }
      } else {
        // This is a unique subtitle file
        oOrg.license = "unique";
        sLink = "none";
      }

      // Adapt the .cmdi.xml file for this item
      String sFileCmdi = oOrg.file.Replace(".folia.xml", ".cmdi.xml");

      // Do we need to continue with this item?
      if (!File.Exists(sFileCmdi)) {
        // If the CMDI file does not exist, we cannot adapt it
        // It is not really an error, but we should give a warning
        errHandle.Status("findDuplicates: skipping non-existent [" + sFileCmdi + "]");
        // Skip only this instance; the remaining instances still need processing
        continue;
      }

      // Read the CMDI
      XmlDocument pdxCmdi = new XmlDocument();
      pdxCmdi.Load(sFileCmdi);
      oTools.SetXmlDocument(pdxCmdi, CLARIN_CMDI);
      // Get correct namespace manager
      XmlNamespaceManager nsFolia = new XmlNamespaceManager(pdxCmdi.NameTable);
      nsFolia.AddNamespace("f", pdxCmdi.DocumentElement.NamespaceURI);

      // Find the <Subtitle> element
      XmlNode ndxSubtitle = pdxCmdi.SelectSingleNode("./descendant::f:Subtitle", nsFolia);
      if (ndxSubtitle != null) {
        // Get the movie id (a missing <MovieId> is treated like an empty one)
        XmlNode ndxMovieId = pdxCmdi.SelectSingleNode("./descendant::f:MovieId", nsFolia);
        String sIdMovie = (ndxMovieId == null) ? "" : ndxMovieId.InnerText;
        // Get the list of languages
        String sSubLangs = "";
        if (!getSubtitleLanguages(sIdMovie, ref sSubLangs)) return false;

        // Remove any available list
        XmlNode ndxAvailableList = ndxSubtitle.SelectSingleNode("./child::f:AvailableList", nsFolia);
        if (ndxAvailableList != null) {
          ndxAvailableList.RemoveAll();
          ndxSubtitle.RemoveChild(ndxAvailableList);
          // Together with the list, also drop the first <languageAvailable>
          XmlNode ndxTmp = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
          if (ndxTmp != null) {
            ndxTmp.RemoveAll();
            ndxSubtitle.RemoveChild(ndxTmp);
          }
        }

        // Process the languages available list:
        XmlNode ndxLngAvail = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
        if (ndxLngAvail == null) {
          ndxLngAvail = oTools.AddXmlChild(ndxSubtitle, "languageAvailable", "", sSubLangs, "text");
        } else {
          // Check if this string is already in there
          if (!ndxLngAvail.InnerText.Contains(sSubLangs)) {
            // Add the string
            ndxLngAvail.InnerText = ndxLngAvail.InnerText + " " + sSubLangs;
          }
        }

        // Check if there is a statusinfo child
        XmlNode ndxStatusInfo = ndxSubtitle.SelectSingleNode("./child::f:StatusInfo", nsFolia);
        if (ndxStatusInfo == null) {
          // Create such a child
          ndxStatusInfo = oTools.AddXmlChild(ndxSubtitle, "StatusInfo",
            "status", "", "attribute",
            "link", "", "attribute");
        } else {
          // Get any status info evidence there is
          XmlNode ndxEvid = ndxStatusInfo.SelectSingleNode("./child::f:Evidence", nsFolia);
          while (ndxEvid != null) {
            lEvid.Add(ndxEvid.InnerText);
            ndxEvid = ndxEvid.SelectSingleNode("./following-sibling::f:Evidence", nsFolia);
          }
        }
        // Add the information into the status info node
        ndxStatusInfo.Attributes["status"].Value = oOrg.license;
        ndxStatusInfo.Attributes["link"].Value = sLink;

        // Remove any previous links that might be still in here
        XmlNode ndxWork = ndxStatusInfo.SelectSingleNode("./child::f:Similar", nsFolia);
        while (ndxWork != null) {
          XmlNode ndxRemove = ndxWork;
          // Move on *before* removing, so that the sibling axis is still valid
          ndxWork = ndxWork.SelectSingleNode("./following-sibling::f:Similar", nsFolia);
          // Remove worknode contents
          ndxRemove.RemoveAll();
          // Remove the worknode itself
          ndxStatusInfo.RemoveChild(ndxRemove);
        }
        // Add any links
        for (int j = 0; j < lSimilar.Count; j++) {
          XmlNode ndxSimi = oTools.AddXmlChild(ndxStatusInfo, "Similar", "SimilarId", Convert.ToString(j + 1), "attribute");
          ndxSimi.InnerText = Path.GetFileNameWithoutExtension(lSimilar[j]);
        }

        // Try to find out more information, depending on the status we have found
        switch (oOrg.license) {
          case "copy":
          case "largest":
          case "equal":
          case "unique":
            bool bCopyright = false;
            bool bTranslated = false;
            bool bDownload = false;
            String sDetails = "";
            // Walk through all evidence
            for (int j = 0; j < lEvid.Count; j++) {
              // Get this evidence
              String sEvid = lEvid[j].ToLower();
              // Check for RIP information
              // "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*"
              if (util.General.DoLike(sEvid, "*ripped*|*copyright*")) {
                // Add the copyright information as evidence if it is necessary
                if (bTranslated || bDownload || !bCopyright) sDetails = sEvid;
                bCopyright = true;
              } else if (util.General.DoLike(sEvid, "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*")) {
                // Double check the text of the evidence
                if (util.General.DoLike(sEvid, "*broadcast text*|*bti *")) {
                  // This is 'stolen' from BTI or its predecessor
                  bCopyright = true;
                  sEvid = "BTI: " + sEvid;
                } else {
                  bTranslated = true;
                }
                if (sDetails == "" || bDownload) sDetails = sEvid;
              } else if (util.General.DoLike(sEvid, "*download*")) {
                bDownload = true;
                if (sDetails == "") sDetails = sEvid;
              }
            }
            // Additional check
            XmlNode ndxUserClass = ndxSubtitle.SelectSingleNode("./descendant::f:Author/child::f:UserClass", nsFolia);
            if (ndxUserClass != null) {
              String sUserClass = ndxUserClass.InnerText.ToLower();
              if (sUserClass == "subtranslator") {
                bTranslated = true;
                sDetails = "userclass=SubTranslator";
              }
            }
            // We should be able to determine the license information
            // (precedence: copyright > translation > download > unknown)
            String sLicense = "";
            if (bCopyright) sLicense = "copyright";
            else if (bTranslated) sLicense = "translation";
            else if (bDownload) sLicense = "download";
            else sLicense = "unknown";

            // Find the location where we are going to put this information
            XmlNode ndxLicenseType = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseType", nsFolia);
            XmlNode ndxLicenseDetails = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseDetails", nsFolia);
            if (ndxLicenseType == null || ndxLicenseDetails == null) {
              // Nowhere to store (part of) the license: warn, but keep processing this file
              errHandle.Status("findDuplicates: no <LicenseType> and/or <LicenseDetails> in [" + sFileCmdi + "]");
            }
            if (ndxLicenseType != null) ndxLicenseType.InnerText = sLicense;
            if (ndxLicenseDetails != null) ndxLicenseDetails.InnerText = sDetails;

            // Adapt the target directory (missing nodes are treated like empty ones)
            XmlNode ndxYear = pdxCmdi.SelectSingleNode("./descendant::f:Year", nsFolia);
            XmlNode ndxImdbId = pdxCmdi.SelectSingleNode("./descendant::f:ImdbId", nsFolia);
            String sYear = (ndxYear == null) ? "" : ndxYear.InnerText;
            String sImdbId = (ndxImdbId == null) ? "" : ndxImdbId.InnerText;
            // Determine the target directory...
            // WAS: sTargetDir += oOrg.license + "/" + sLicense + "/";
            if (sYear != "") sTargetDir += sYear + "/"; else sTargetDir += "unknown/";
            if (sImdbId != "") sTargetDir += sImdbId + "/";
            break;
          default:
            // No further license determination is needed
            break;
        }
      }

      // Find the <Movie> element
      XmlNode ndxMovie = pdxCmdi.SelectSingleNode("./descendant::f:Movie", nsFolia);
      if (ndxMovie != null) {
        // Movie information needs to be gathered *ALWAYS*
        XmlNode ndxImdbId = ndxMovie.SelectSingleNode("./child::f:ImdbId", nsFolia);
        String sImdbId = (ndxImdbId == null) ? "" : ndxImdbId.InnerText;
        // Get the movie information
        MovieInfo oInfo = objOmdb.getInfo(sImdbId);
        if (oInfo == null) {
          // Nothing to add for this movie; report it and carry on
          errHandle.Status("findDuplicates: could not get information for imdb="+sImdbId);
        } else {
          // (1) Add the runtime information
          if (!addOneInfo(ndxMovie, nsFolia, "Runtime", oInfo.runtime)) return false;
          // (2) Add the COUNTRY information
          if (!addMultiInfo(ndxMovie, nsFolia, "Country", oInfo.country)) return false;
          // (3) Add the GENRE information
          if (!addMultiInfo(ndxMovie, nsFolia, "Genre", oInfo.genre)) return false;
          // (4) Add the LANGUAGE information
          if (!addMultiInfo(ndxMovie, nsFolia, "Language", oInfo.language)) return false;
          // (5) Add the DIRECTOR information
          if (!addMultiInfo(ndxMovie, nsFolia, "Director", oInfo.director)) return false;
          // (6) Add the WRITER information
          if (!addMultiInfo(ndxMovie, nsFolia, "Writer", oInfo.writer)) return false;
          // (7) Add the ACTOR information
          if (!addMultiInfo(ndxMovie, nsFolia, "Actor", oInfo.actors)) return false;
          // (8) Add other information: rated, released, plot, awards, imdbRating, imdbVotes
          if (!addOneInfo(ndxMovie, nsFolia, "Rated", oInfo.rated)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Released", oInfo.released)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Plot", oInfo.plot)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "Awards", oInfo.awards)) return false;
          if (!addOneInfo(ndxMovie, nsFolia, "imdbRating", oInfo.imdbRating)) return false;
          // Votes are stored without the thousands separator
          if (!addOneInfo(ndxMovie, nsFolia, "imdbVotes", oInfo.imdbVotes.Replace(",", ""))) return false;

          // (9) Look for the <Series>...
          XmlNode ndxSeries = pdxCmdi.SelectSingleNode("./descendant::f:Series", nsFolia);
          if (ndxSeries != null) {
            // Get the nodes we are interested in
            XmlNode ndxSeason = ndxSeries.SelectSingleNode("./child::f:Season", nsFolia);
            XmlNode ndxEpisode = ndxSeries.SelectSingleNode("./child::f:Episode", nsFolia);
            XmlNode ndxParent = ndxSeries.SelectSingleNode("./child::f:ParentImdbId", nsFolia);
            if (ndxParent != null && ndxParent.InnerText != "") {
              // NOTE(review): the parent information is requested but not used for the
              //   names below; the call is kept because getInfo() may have side effects
              //   that are not visible from here
              objOmdb.getInfo(ndxParent.InnerText);
            }
            // Add the season/episode information: for now the names are always empty
            if (ndxSeason != null && ndxEpisode != null) {
              oTools.AddAttribute(ndxSeason, "Name", "");
              oTools.AddAttribute(ndxEpisode, "Name", "");
            }
          }
        }
      }

      // Save the adapted CMDI
      pdxCmdi.Save(sFileCmdi);

      // Create the target directory if it does not exist yet
      if (!Directory.Exists(sTargetDir)) {
        Directory.CreateDirectory(sTargetDir);
      }
      // Get the file name (without the ".cmdi.xml" part) and the source directory
      String sName = Path.GetFileNameWithoutExtension(sFileCmdi).Replace(".cmdi", "");
      String sSrc = Path.GetDirectoryName(sFileCmdi);
      if (!sSrc.EndsWith("/") && !sSrc.EndsWith("\\")) sSrc += "/";
      // Copy the CMDI
      File.Copy(sFileCmdi, sTargetDir + sName + ".cmdi.xml", true);
      // Copy the folia
      sName = sName + ".folia.xml.gz";
      File.Copy(sSrc + sName, sTargetDir + sName, true);

      // Show where we are
      errHandle.Status("copying:\t" + oOrg.name + "\t" + oOrg.license + "\t" + sLink + "\t" + sTargetDir);
    }

    // Return positively
    return true;
  } catch (Exception ex) {
    errHandle.DoError("oprConv/findDuplicates", ex);
    return false;
  }
}
/* -------------------------------------------------------------------------------------
 * Name:  Main
 * Goal:  Command-line entry point + argument handling
 * Args:  -i <path>  Input directory with .folia.xml.gz files (required)
 *        -o <dir>   Output directory
 *        -m <file>  Movie dictionary (tab-separated list from opensubtitles.org)
 *        -l <code>  Language (three letter code)
 *        -f         Force
 *        -s         Skip everything that has *not* been made
 *        -h         Calculate hashes and add them to existing .cmdi.xml files
 *        -v         Make an overview (written to oview.csv)
 *        -d         Debugging
 * Notes: - An option that needs a value but is the last argument is reported as
 *          a syntax error
 *        - Options that are not listed above are ignored
 *        - Exceptions are reported through errHandle and then re-thrown
   ------------------------------------------------------------------------------------- */
static void Main(string[] args) {
  String sInput = "";                       // Input file or dir
  String sOutput = "/scratch/ekomen/out/";  // Output directory, if specified
  String sLanguage = "dut";                 // This is the language abbreviation used in [osrMovie.cs] for sBaseUrl
  String sDict = "";                        // Movie dictionary
  bool bIsDebug = false;                    // Debugging
  bool bForce = false;                      // Force
  bool bOview = false;                      // Make overview or not
  bool bSkip = false;                       // Skip everything that has *not* been made
  String sAction = "cmdi";                  // Type of action to be taken

  try {
    // Check command-line options
    for (int i = 0; i < args.Length; i++) {
      // Get this argument
      String sArg = args[i].Trim();
      if (sArg.StartsWith("-")) {
        errHandle.Status("Processing argument ["+sArg+"]");
        String sOpt = sArg.Substring(1);
        // Options that consume the next argument must actually have one
        bool bNeedsValue = (sOpt == "i" || sOpt == "m" || sOpt == "o" || sOpt == "l");
        if (bNeedsValue && i + 1 >= args.Length) {
          SyntaxError("3 - option [" + sArg + "] requires a value");
          return;
        }
        // Check out the arguments
        switch (sOpt) {
          case "i": // Input file or directory with .folia.xml files
            sInput = args[++i];
            break;
          case "f": // Force
            bForce = true;
            break;
          case "s": // Skip
            bSkip = true;
            break;
          case "m": // Movie dictionary -- Tab-separated list from opensubtitles.org
            sDict = args[++i];
            break;
          case "o": // Output directory
            sOutput = args[++i];
            break;
          case "h": // Calculate hashes and add them to existing .cmdi.xml files
            sAction = "hash";
            break;
          case "v": // Make an overview
            bOview = true;
            break;
          case "d": // Debugging
            bIsDebug = true;
            break;
          case "l": // Language (three letter code)
            sLanguage = args[++i];
            break;
          default:  // Unknown options are ignored
            break;
        }
      } else if (sArg == "" || sArg == "\r") {
        // Do nothing
      } else {
        // Throw syntax error and leave
        SyntaxError("1 - i=" + i + " args=" + args.Length + " argCurrent=[" + sArg + "]");
        return;
      }
    }
    // Check presence of input (the output has a default value)
    if (sInput == "") { SyntaxError("2"); return; }

    // Initialize the main entry point for the conversion
    oprConv objConv = new oprConv(errHandle);
    osrMovie objMovie = new osrMovie(errHandle, sLanguage);
    omdbapi objOmdb = new omdbapi(errHandle);

    // Set directory for conversion
    objConv.dirRoot(sOutput);

    // Load the movie dictionary
    if (!objConv.loadMovieDictionary(sDict)) {
      errHandle.DoError("Main", "Could not load movie dictionary from [" + sDict + "]");
      return;
    }

    // Initialise the Treebank Xpath functions, which may make use of tb:matches()
    util.XPathFunctions.conTb.AddNamespace("tb", util.XPathFunctions.TREEBANK_EXTENSIONS);

    // Check if the input is a directory or file
    if (Directory.Exists(sInput)) {
      WalkDirectoryTree(sInput, "*.folia.xml.gz", sInput, bForce, bSkip, bIsDebug, sAction, ref objConv, ref objMovie);
    } else {
      // Show we don't have input file
      errHandle.DoError("Main", "Cannot find input file(s) in: " + sInput);
    }

    // Calculate for each file which others are close to it
    // - try to determine the license information for the best matching .cmdi.xml files
    // - add some more meta-information to the .cmdi.xml files
    // The value 3 is the largest Hamming distance that still counts as a duplicate
    if (!objConv.findDuplicates(ref lSubInst, 3, ref objOmdb)) {
      // Make the failure visible here too, but still try to produce the overview
      errHandle.DoError("Main", "findDuplicates did not complete successfully");
    }

    // Create an overview - if required
    if (bOview) {
      String sOview = objConv.getDistanceOview();
      // Save it in a standard file
      String sFileCsv = Path.GetDirectoryName(sInput) + "/oview.csv";
      File.WriteAllText(sFileCsv, sOview);
    }
    // Exit the program
    Console.WriteLine("Ready");
  } catch (Exception ex) {
    errHandle.DoError("Main", ex); // Provide standard error message
    throw;
  }
}