Beispiel #1
0
        /* -------------------------------------------------------------------------------------
          * Name:        findDuplicates
          * Goal:        Go through the list and find any duplicates
          *              Add this duplicate information to the .cmdi file
          * History:
          * 10/feb/2016 ERK Created
        ------------------------------------------------------------------------------------- */
        // Parameters:
        //   lSubInst - list of subtitle instances; each entry gets its duplicate list filled in
        //              (via addDuplicate) and its [license] set. The list itself is not replaced.
        //   iGoodHd  - maximum simhash Hamming distance at which two instances that share a
        //              movie id or imdb id are still regarded as duplicates of one another
        //   objOmdb  - provider of movie information, queried through getInfo(imdbId)
        // Returns:
        //   true on success; false when a helper reports failure or an exception is caught
        //   (exceptions are reported through errHandle.DoError).
        // Side effects:
        //   Each instance's .cmdi.xml file is rewritten in place, and the .cmdi.xml plus the
        //   matching .folia.xml.gz are copied into a directory below [sDirRoot].
        //   An instance whose .cmdi.xml file is missing is skipped with a status message.
        public bool findDuplicates(ref List<SubInstance> lSubInst, int iGoodHd, ref omdbapi objOmdb)
        {
          try {
            // Need to have a hash analyzer object
            util.SimHashAnalyzer oSim = new util.SimHashAnalyzer();
            // Walk through the list of instances
            for (int i = 0; i < lSubInst.Count; i++) {
              // Walk through all other instances that could match up with me
              for (int j = 0; j < i; j++) {
                // Check if the idmovie or the imdbid is the same between <i,j>
                if (lSubInst[i].sIdMovie == lSubInst[j].sIdMovie ||
                    lSubInst[i].sImdbId == lSubInst[j].sImdbId) {
                  // Get the hamming distance between items <i,j>
                  int iHdist = oSim.GetHammingDistance(lSubInst[i].simhash, lSubInst[j].simhash);
                  // Is this on or below the threshold?
                  if (iHdist <= iGoodHd) {
                    // Register the duplicate relation in BOTH directions,
                    // otherwise the order of occurrence would play a role
                    lSubInst[i].addDuplicate(j);
                    lSubInst[j].addDuplicate(i);
                  }
                }
              }
            }

            // Walk through all the instances again
            for (int i = 0; i < lSubInst.Count; i++) {
              String sTargetDir = sDirRoot;   // Directory where we will store the result

              // Get this instance
              SubInstance oOrg = lSubInst[i];

              // Initialisations
              String sLink = "";
              List<String> lSimilar = new List<string>();
              List<String> lEvid = new List<string>();

              // Does this one have duplicates?
              if (oOrg.lDup.Count > 0) {
                // Find the first longest text both in words and sentences
                int iWords = oOrg.words;
                int iSents = oOrg.sents;
                int iLongest = -1;      // Position *within oOrg.lDup* of the longest duplicate
                bool bEqual = true;     // Assume all copies are equal in size until one differs
                for (int j = 0; j < oOrg.lDup.Count; j++) {
                  SubInstance oThis = lSubInst[oOrg.lDup[j]];
                  if (oThis.words > iWords && oThis.sents >= iSents) {
                    // Adapt the new maximum
                    iWords = oThis.words;
                    iSents = oThis.sents;
                    iLongest = j;
                  }
                  // Check for inequality (only relevant when no longer duplicate is found)
                  if (oThis.words != iWords || oThis.sents != iSents) bEqual = false;
                }
                // Check what the longest is; that's the most 'original'
                if (iLongest < 0) {
                  // Are they equal?
                  if (bEqual) {
                    // The [oOrg] and all its duplicates have the same size
                    oOrg.license = "equal";
                    sLink = "list";
                  } else {
                    // The [oOrg] is the longest
                    oOrg.license = "largest";
                    sLink = "this";
                  }
                  // Create a list of similar ones
                  for (int j = 0; j < oOrg.lDup.Count; j++) {
                    lSimilar.Add(lSubInst[oOrg.lDup[j]].name);
                  }
                } else {
                  // Another one is the longest
                  oOrg.license = "copy";
                  // [iLongest] indexes the duplicate list, so translate it to an instance index first
                  sLink = lSubInst[oOrg.lDup[iLongest]].name;
                }
              } else {
                // This is a unique subtitle file
                oOrg.license = "unique";
                sLink = "none";
              }
              // Adapt the .cmdi.xml file for this item
              String sFileCmdi = oOrg.file.Replace(".folia.xml", ".cmdi.xml");
              // Can we process this instance?
              if (!File.Exists(sFileCmdi)) {
                // If the CMDI file does not exist, we cannot adapt it
                // It is not really an error, but we should give a warning
                errHandle.Status("findDuplicates: skipping non-existent [" + sFileCmdi + "]");
                // Skip only this instance; the remaining ones must still be processed
                continue;
              }
              // Read the CMDI
              XmlDocument pdxCmdi = new XmlDocument();
              pdxCmdi.Load(sFileCmdi);
              oTools.SetXmlDocument(pdxCmdi, CLARIN_CMDI);
              // Get correct namespace manager: prefix "f" maps to the document's own namespace
              XmlNamespaceManager nsFolia = new XmlNamespaceManager(pdxCmdi.NameTable);
              nsFolia.AddNamespace("f", pdxCmdi.DocumentElement.NamespaceURI);
              // Find the <Subtitle> element
              XmlNode ndxSubtitle = pdxCmdi.SelectSingleNode("./descendant::f:Subtitle", nsFolia);
              if (ndxSubtitle != null) {
                // Get the movie id
                // NOTE(review): a CMDI without <MovieId> raises a NullReferenceException here,
                //               which ends up in the catch below (method returns false)
                XmlNode ndxMovieId = pdxCmdi.SelectSingleNode("./descendant::f:MovieId", nsFolia);
                String sIdMovie = ndxMovieId.InnerText;
                // Get the list of languages
                String sSubLangs = "";
                if (!getSubtitleLanguages(sIdMovie, ref sSubLangs)) return false;
                // Remove any available list
                XmlNode ndxAvailableList = ndxSubtitle.SelectSingleNode("./child::f:AvailableList", nsFolia);
                if (ndxAvailableList != null) {
                  ndxAvailableList.RemoveAll();
                  ndxSubtitle.RemoveChild(ndxAvailableList);
                  // When an <AvailableList> was present, also drop the existing <languageAvailable>
                  XmlNode ndxTmp = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
                  if (ndxTmp != null) {
                    ndxTmp.RemoveAll();
                    ndxSubtitle.RemoveChild(ndxTmp);
                  }
                }

                // Process the languages available list:
                XmlNode ndxLngAvail = ndxSubtitle.SelectSingleNode("./child::f:languageAvailable", nsFolia);
                if (ndxLngAvail == null) {
                  ndxLngAvail = oTools.AddXmlChild(ndxSubtitle, "languageAvailable", "", sSubLangs, "text");
                } else {
                  // Check if this string is already in there
                  if (!ndxLngAvail.InnerText.Contains(sSubLangs)) {
                    // Add the string
                    ndxLngAvail.InnerText = ndxLngAvail.InnerText + " " + sSubLangs;
                  }
                }
                // Check if there is a statusinfo child
                XmlNode ndxStatusInfo = ndxSubtitle.SelectSingleNode("./child::f:StatusInfo", nsFolia);
                if (ndxStatusInfo == null) {
                  // Create such a child
                  ndxStatusInfo = oTools.AddXmlChild(ndxSubtitle, "StatusInfo",
                    "status", "", "attribute",
                    "link", "", "attribute");
                } else {
                  // Get any status info evidence there is
                  XmlNode ndxEvid = ndxStatusInfo.SelectSingleNode("./child::f:Evidence", nsFolia);
                  while (ndxEvid != null) {
                    lEvid.Add(ndxEvid.InnerText);
                    ndxEvid = ndxEvid.SelectSingleNode("./following-sibling::f:Evidence", nsFolia);
                  }
                }
                // Add the information into the status info node
                ndxStatusInfo.Attributes["status"].Value = oOrg.license;
                ndxStatusInfo.Attributes["link"].Value = sLink;
                // Remove any previous links that might be still in here
                XmlNode ndxWork = ndxStatusInfo.SelectSingleNode("./child::f:Similar", nsFolia);
                while (ndxWork != null) {
                  XmlNode ndxRemove = ndxWork;
                  // Move on to the next sibling *before* the current node is detached
                  ndxWork = ndxWork.SelectSingleNode("./following-sibling::f:Similar", nsFolia);
                  // Remove worknode contents
                  ndxRemove.RemoveAll();
                  // Remove the worknode itself
                  ndxStatusInfo.RemoveChild(ndxRemove);
                }
                // Add any links
                for (int j = 0; j < lSimilar.Count; j++) {
                  XmlNode ndxSimi = oTools.AddXmlChild(ndxStatusInfo, "Similar",
                    "SimilarId", Convert.ToString(j + 1), "attribute");
                  ndxSimi.InnerText = Path.GetFileNameWithoutExtension(lSimilar[j]);
                }
                // Try to find out more information, depending on the status we have found
                switch (oOrg.license) {
                  case "copy":
                  case "largest":
                  case "equal":
                  case "unique":
                    bool bCopyright = false;
                    bool bTranslated = false;
                    bool bDownload = false;
                    String sDetails = "";
                    // Walk through all evidence
                    for (int j = 0; j < lEvid.Count; j++) {
                      // Get this evidence
                      String sEvid = lEvid[j].ToLower();
                      // Check for RIP information
                      // "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*|*ripped*|*download*|*copyright*"
                      if (util.General.DoLike(sEvid, "*ripped*|*copyright*")) {
                        // Add the copyright information as evidence if it is necessary
                        if (bTranslated || bDownload || !bCopyright) sDetails = sEvid;
                        bCopyright = true;
                      } else if (util.General.DoLike(sEvid, "*vertaald*|*vertaling*|*ondertiteling*|*bewerkt*")) {
                        // Double check the text of the evidence
                        if (util.General.DoLike(sEvid, "*broadcast text*|*bti *")) {
                          // This is 'stolen' from BTI or its predecessor
                          bCopyright = true;
                          sEvid = "BTI: " + sEvid;
                        } else {
                          bTranslated = true;
                        }
                        if (sDetails == "" || bDownload) sDetails = sEvid;
                      } else if (util.General.DoLike(sEvid, "*download*")) {
                        bDownload = true;
                        if (sDetails == "") sDetails = sEvid;
                      }
                    }
                    // Additional check
                    XmlNode ndxUserClass = ndxSubtitle.SelectSingleNode("./descendant::f:Author/child::f:UserClass", nsFolia);
                    if (ndxUserClass != null) {
                      String sUserClass = ndxUserClass.InnerText.ToLower();
                      if (sUserClass == "subtranslator") {
                        bTranslated = true;
                        sDetails = "userclass=SubTranslator";
                      }
                    }
                    // We should be able to determine the license information
                    // Priority: copyright > translation > download > unknown
                    String sLicense = "";
                    if (bCopyright)
                      sLicense = "copyright";
                    else if (bTranslated)
                      sLicense = "translation";
                    else if (bDownload)
                      sLicense = "download";
                    else
                      sLicense = "unknown";
                    // Find the location where we are going to put this information
                    XmlNode ndxLicenseType = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseType", nsFolia);
                    ndxLicenseType.InnerText = sLicense;
                    XmlNode ndxLicenseDetails = ndxSubtitle.SelectSingleNode("./descendant::f:LicenseDetails", nsFolia);
                    ndxLicenseDetails.InnerText = sDetails;
                    // Adapt the target directory
                    XmlNode ndxYear = pdxCmdi.SelectSingleNode("./descendant::f:Year", nsFolia);
                    XmlNode ndxImdbId = pdxCmdi.SelectSingleNode("./descendant::f:ImdbId", nsFolia);
                    String sYear = ndxYear.InnerText;
                    String sImdbId = ndxImdbId.InnerText;
                    // Determine the target directory: <root><year or "unknown">/<imdbid>/
                    // WAS: sTargetDir += oOrg.license + "/" + sLicense + "/";
                    if (sYear != "")
                      sTargetDir += sYear + "/";
                    else
                      sTargetDir += "unknown/";
                    if (sImdbId != "") sTargetDir += sImdbId + "/";
                    break;
                  default:
                    // No further license determination is needed
                    break;
                }
              }
              // Find the <Movie> element
              XmlNode ndxMovie = pdxCmdi.SelectSingleNode("./descendant::f:Movie", nsFolia);
              if (ndxMovie != null) {
                // Movie information needs to be gathered *ALWAYS*
                XmlNode ndxImdbId = ndxMovie.SelectSingleNode("./child::f:ImdbId", nsFolia);
                String sImdbId = ndxImdbId.InnerText;
                // Get the movie information
                MovieInfo oInfo = objOmdb.getInfo(sImdbId);
                if (oInfo == null) {
                  // Not sure what to do now: report it and carry on without movie details
                  errHandle.Status("findDuplicates: could not get information for imdb="+sImdbId);
                } else {
                  // (1) Add the runtime information
                  if (!addOneInfo(ndxMovie, nsFolia, "Runtime", oInfo.runtime)) return false;
                  // (2) Add the COUNTRY information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Country", oInfo.country)) return false;
                  // (3) Add the GENRE information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Genre", oInfo.genre)) return false;
                  // (4) Add the LANGUAGE information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Language", oInfo.language)) return false;
                  // (5) Add the DIRECTOR information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Director", oInfo.director)) return false;
                  // (6) Add the WRITER information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Writer", oInfo.writer)) return false;
                  // (7) Add the ACTOR information
                  if (!addMultiInfo(ndxMovie, nsFolia, "Actor", oInfo.actors)) return false;
                  // (8) Add other information: rated, released, plot, awards, imdbRating, imdbVotes
                  if (!addOneInfo(ndxMovie, nsFolia, "Rated", oInfo.rated)) return false;
                  if (!addOneInfo(ndxMovie, nsFolia, "Released", oInfo.released)) return false;
                  if (!addOneInfo(ndxMovie, nsFolia, "Plot", oInfo.plot)) return false;
                  if (!addOneInfo(ndxMovie, nsFolia, "Awards", oInfo.awards)) return false;
                  if (!addOneInfo(ndxMovie, nsFolia, "imdbRating", oInfo.imdbRating)) return false;
                  // Votes are stored without the thousands separator
                  if (!addOneInfo(ndxMovie, nsFolia, "imdbVotes", oInfo.imdbVotes.Replace(",", ""))) return false;
                  // (9) Look for the <Series>...
                  XmlNode ndxSeries = pdxCmdi.SelectSingleNode("./descendant::f:Series", nsFolia);
                  if (ndxSeries != null) {
                    // Get the nodes we are interested in
                    XmlNode ndxSeason = ndxSeries.SelectSingleNode("./child::f:Season", nsFolia);
                    XmlNode ndxEpisode = ndxSeries.SelectSingleNode("./child::f:Episode", nsFolia);
                    XmlNode ndxParent = ndxSeries.SelectSingleNode("./child::f:ParentImdbId", nsFolia);
                    MovieInfo oParent = null;
                    if (ndxParent != null && ndxParent.InnerText != "") {
                      String sParentImdbId = ndxParent.InnerText;
                      oParent = objOmdb.getInfo(sParentImdbId);
                    }
                    // Add the season/episode information
                    // NOTE(review): both branches currently write an empty "Name" attribute;
                    //               the parent information is fetched but not used yet
                    if (ndxSeason != null && ndxEpisode != null) {
                      if (oParent == null) {
                        oTools.AddAttribute(ndxSeason, "Name", "");
                        oTools.AddAttribute(ndxEpisode, "Name", "");
                      } else {
                        oTools.AddAttribute(ndxSeason, "Name", "");
                        oTools.AddAttribute(ndxEpisode, "Name", "");
                      }
                    }
                  }
                }
              }
              // Save the adapted CMDI
              pdxCmdi.Save(sFileCmdi);

              // Create the target directory if it does not exist yet
              if (!Directory.Exists(sTargetDir)) {
                Directory.CreateDirectory(sTargetDir);
              }
              // Get the file name
              String sName = Path.GetFileNameWithoutExtension(sFileCmdi).Replace(".cmdi", "");
              String sSrc = Path.GetDirectoryName(sFileCmdi);
              if (!sSrc.EndsWith("/") && !sSrc.EndsWith("\\")) sSrc += "/";
              // Copy the CMDI
              File.Copy(sFileCmdi, sTargetDir + sName + ".cmdi.xml", true);
              // Copy the folia (the gzipped version that sits next to the CMDI)
              sName = sName + ".folia.xml.gz";
              File.Copy(sSrc + sName, sTargetDir + sName, true);
              // Show where we are
              errHandle.Status("copying:\t" + oOrg.name + "\t" + oOrg.license + "\t" + sLink + "\t" + sTargetDir);

            }

            // Return positively
            return true;
          } catch (Exception ex) {
            errHandle.DoError("oprConv/findDuplicates", ex);
            return false;
          }
        }
Beispiel #2
0
        // Command-line entry point + argument handling
        // Recognised options:
        //   -i <path>  input directory with .folia.xml.gz files (required)
        //   -o <dir>   output directory
        //   -m <file>  movie dictionary
        //   -l <code>  language code
        //   -f force, -s skip, -h hash action, -v write overview, -d debug
        // An option that needs a value but is the last argument is reported as a syntax error.
        // Unknown options starting with "-" are ignored, as before.
        static void Main(string[] args)
        {
          String sInput = "";                           // Input file or dir
          String sOutput = "/scratch/ekomen/out/";      // Output directory, if specified
          String sLanguage = "dut";                     // This is the language abbreviation used in [osrMovie.cs] for sBaseUrl
          String sDict = "";                            // Movie dictionary
          bool bIsDebug = false;                        // Debugging
          bool bForce = false;                          // Force
          bool bOview = false;                          // Make overview or not
          bool bSkip = false;                           // Skip everything that has *not* been made
          String sAction = "cmdi";                      // Type of action to be taken

          try {
            // Check command-line options
            for (int i = 0; i < args.Length; i++) {
              // get this argument
              String sArg = args[i].Trim();
              if (sArg.StartsWith("-")) {
                errHandle.Status("Processing argument ["+sArg+"]");
                String sOption = sArg.Substring(1);
                // These options consume the next argument as their value; make sure it exists,
                // otherwise args[++i] would raise an IndexOutOfRangeException
                bool bNeedsValue = (sOption == "i" || sOption == "m" || sOption == "o" || sOption == "l");
                if (bNeedsValue && i + 1 >= args.Length) {
                  SyntaxError("3 - missing value after argument [" + sArg + "]"); return;
                }
                // Check out the arguments
                switch (sOption) {
                  case "i": // Input file or directory with .folia.xml files
                    sInput = args[++i];
                    break;
                  case "f": // Force
                    bForce = true;
                    break;
                  case "s": // Skip
                    bSkip = true;
                    break;
                  case "m": // Movie dictionary   -- Tab-separated list from opensubtitles.org
                    sDict = args[++i];
                    break;
                  case "o": // Output directory
                    sOutput = args[++i];
                    break;
                  case "h": // Calculate hashes and add them to existing .cmdi.xml files
                    sAction = "hash";
                    break;
                  case "v": // Make an overview
                    bOview = true;
                    break;
                  case "d": // Debugging
                    bIsDebug = true;
                    break;
                  case "l": // Language (three letter code)
                    sLanguage = args[++i];
                    break;
                }
              } else if (sArg == "" || sArg == "\r") {
                // Do nothing: empty arguments (e.g. a stray carriage return) are ignored
              } else {
                // Throw syntax error and leave
                SyntaxError("1 - i=" + i + " args=" + args.Length + " argCurrent=[" + sArg + "]"); return;
              }
            }
            // Check presence of input
            if (sInput == "" ) { SyntaxError("2"); return; }

            // Initialize the main entry point for the conversion
            oprConv objConv = new oprConv(errHandle);
            osrMovie objMovie = new osrMovie(errHandle, sLanguage);
            omdbapi objOmdb = new omdbapi(errHandle);

            // Set directory for conversion
            objConv.dirRoot(sOutput);

            // Load the movie dictionary
            if (!objConv.loadMovieDictionary(sDict)) {
              errHandle.DoError("Main", "Could not load movie dictionary from [" + sDict + "]");
              return;
            }

            // Initialise the Treebank Xpath functions, which may make use of tb:matches()
            util.XPathFunctions.conTb.AddNamespace("tb", util.XPathFunctions.TREEBANK_EXTENSIONS);

            // Only a directory is accepted as input
            if (Directory.Exists(sInput)) {
              WalkDirectoryTree(sInput, "*.folia.xml.gz", sInput, bForce, bSkip, bIsDebug, sAction,
                ref objConv, ref objMovie);
            } else {
              // Report that the input directory is not there
              // NOTE(review): processing continues after this error -- confirm that is intended
              errHandle.DoError("Main", "Cannot find input file(s) in: " + sInput);
            }
            // Calculate for each file which others are close to it
            // - try to determine the license information for the best matching .cmdi.xml files
            // - add some more meta-information to the .cmdi.xml files
            // NOTE(review): the boolean result is not checked here
            objConv.findDuplicates(ref lSubInst, 3, ref objOmdb);

            // Create an overview - if required
            if (bOview) {
              String sOview = objConv.getDistanceOview();
              // Save it in a standard file
              String sFileCsv = Path.GetDirectoryName(sInput) + "/oview.csv";
              File.WriteAllText(sFileCsv, sOview);
            }
            // Exit the program
            Console.WriteLine("Ready");
          } catch (Exception ex) {
            errHandle.DoError("Main", ex); // Provide standard error message
            throw;
          }
        }