/// <summary>
/// Transliterates the given lower-cased tokens by launching the executable at
/// <paramref name="mosesPath"/> once per non-empty work bucket and parsing the n-best list
/// file each process is asked to write.
/// </summary>
/// <remarks>
/// Tokens are distributed round-robin over the buckets. Bucket <c>i</c> is written to
/// <c>tempFilePath + i + ".tmp"</c> and its n-best output is expected in the same path with
/// <c>".n_best"</c> appended; both files are deleted after parsing. Every accepted candidate is
/// added both to the returned dictionary and to the shared <c>translitTemp</c> cache under the
/// "srcLang_trgLang" key built from <paramref name="translEntry"/>.
/// </remarks>
/// <param name="lowerCasedTerms">Tokens to transliterate; only the keys are read.</param>
/// <param name="translEntry">Transliteration settings (ini path, n-best size, thresholds, probability mapping).</param>
/// <param name="mosesPath">Path of the executable to start; its directory is used as working directory.</param>
/// <param name="tempFilePath">Prefix for the per-bucket temporary files.</param>
/// <param name="threadCount">Number of buckets/processes; values below 1 are treated as 1.</param>
/// <returns>
/// Term-to-candidates map (never null). Empty when a required argument is null/blank or no
/// tokens were supplied.
/// </returns>
public static Dictionary<string, List<StringProbabEntry>> GetTransliterations(Dictionary<string, int> lowerCasedTerms, MPAlignerConfigurationTranslEntry translEntry, string mosesPath, string tempFilePath, int threadCount)
{
    Dictionary<string, List<StringProbabEntry>> res = new Dictionary<string, List<StringProbabEntry>>();
    if (translEntry == null || lowerCasedTerms == null || lowerCasedTerms.Count < 1 || string.IsNullOrWhiteSpace(mosesPath) || string.IsNullOrWhiteSpace(tempFilePath))
    {
        return res;
    }

    // A non-positive count would make the list constructor throw or the modulo below divide by zero.
    int bucketCount = Math.Max(1, threadCount);

    string langKey = (translEntry.srcLang != null ? translEntry.srcLang : "") + "_" + (translEntry.trgLang != null ? translEntry.trgLang : "");

    // Make sure the shared cache has a slot for this language pair; otherwise the lookups in the
    // parsing loop would throw and every candidate would be dropped.
    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    Log.Write("Starting transliteration of " + lowerCasedTerms.Count.ToString() + " tokens.", LogLevelType.LIMITED_OUTPUT);

    // Distribute the tokens round-robin so that every process gets a similar share.
    List<List<string>> buckets = new List<List<string>>(bucketCount);
    for (int i = 0; i < bucketCount; i++)
    {
        buckets.Add(new List<string>());
    }
    int idx = 0;
    foreach (string term in lowerCasedTerms.Keys)
    {
        buckets[idx % bucketCount].Add(term);
        idx++;
    }

    string directory = Path.GetDirectoryName(mosesPath);
    List<Process> processes = new List<Process>();
    for (int i = 0; i < buckets.Count; i++)
    {
        if (buckets[i].Count == 0)
        {
            continue;
        }

        string tmpFile = tempFilePath + i.ToString() + ".tmp";
        Process process = null;
        bool started = false;
        try
        {
            WriteWordsForTransliteration(buckets[i], tmpFile);

            ProcessStartInfo startInfo = new ProcessStartInfo(mosesPath);
            startInfo.UseShellExecute = false;
            startInfo.WorkingDirectory = directory;
            startInfo.FileName = mosesPath;
            startInfo.CreateNoWindow = true;
            startInfo.RedirectStandardOutput = true;
            startInfo.RedirectStandardError = true;

            StringBuilder sb = new StringBuilder();
            sb.Append(" -f ");
            sb.Append("\"" + translEntry.mosesIniPath + "\" ");
            sb.Append(" -i ");
            sb.Append("\"" + tmpFile + "\" ");
            sb.Append(" -n-best-list ");
            sb.Append("\"" + tmpFile + ".n_best\" " + translEntry.nBest.ToString());
            startInfo.Arguments = sb.ToString();

            process = new Process();
            process.StartInfo = startInfo;
            process.ErrorDataReceived += p_ErrorDataReceived;
            process.OutputDataReceived += p_OutputDataReceived;
            process.Start();

            // Only processes that really started are waited for below; calling WaitForExit on a
            // Process that never started throws.
            started = true;
            processes.Add(process);

            process.BeginOutputReadLine();
            process.BeginErrorReadLine();
        }
        catch (Exception ex)
        {
            Log.Write("Failed to run transliteration for \"" + tmpFile + "\": " + ex.Message, LogLevelType.WARNING);
            if (process != null && !started)
            {
                process.Dispose();
            }
        }
    }

    foreach (Process process in processes)
    {
        process.WaitForExit();
        process.Close();
        process.Dispose();
    }
    processes.Clear();

    // Scores are parsed with '.' as the decimal separator regardless of the current culture.
    NumberFormatInfo nfi = new NumberFormatInfo();
    nfi.CurrencyDecimalSeparator = ".";
    nfi.NumberDecimalSeparator = ".";
    nfi.PercentDecimalSeparator = ".";
    string[] sep = { "|||" };

    for (int i = 0; i < buckets.Count; i++)
    {
        List<string> bucket = buckets[i];
        if (bucket.Count == 0)
        {
            continue;
        }

        string tmpFile = tempFilePath + i.ToString() + ".tmp";
        string nBestFile = tmpFile + ".n_best";
        if (File.Exists(nBestFile))
        {
            // Per term: candidate strings that were already accepted (used for de-duplication).
            Dictionary<string, Dictionary<string, bool>> existingTranslits = new Dictionary<string, Dictionary<string, bool>>();
            int skippedLines = 0;

            // The reader must be closed before the file is deleted below.
            using (StreamReader sr = new StreamReader(nBestFile, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Only lines with exactly four "|||"-separated fields are used:
                    // field 0 = index into the bucket, field 1 = candidate, field 3 = log score.
                    string[] dataArr = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
                    if (dataArr.Length != 4)
                    {
                        continue;
                    }
                    try
                    {
                        int id = Convert.ToInt32(dataArr[0].Trim());

                        StringProbabEntry spe = new StringProbabEntry();
                        spe.str = dataArr[1].Trim().Replace(" ", "");
                        string probabStr = dataArr[3].Trim().Replace(',', '.');
                        spe.probab = Math.Exp(Convert.ToDouble(probabStr, nfi));
                        if (spe.probab > 1)
                        {
                            spe.probab = 1;
                        }

                        if (id < 0 || id >= bucket.Count)
                        {
                            continue;
                        }

                        string term = bucket[id];
                        double min = Math.Min(spe.str.Length, term.Length);
                        double max = Math.Max(spe.str.Length, term.Length);
                        // Ratio of the shorter to the longer length (NaN when both are empty,
                        // which fails the comparison below).
                        double lenDiff = min / max;
                        if (lenDiff >= translEntry.maxLenDiff)
                        {
                            if (!existingTranslits.ContainsKey(term))
                            {
                                existingTranslits.Add(term, new Dictionary<string, bool>());
                            }
                            if (!res.ContainsKey(term))
                            {
                                res.Add(term, new List<StringProbabEntry>());
                            }
                            if (!translitTemp[langKey].ContainsKey(term))
                            {
                                translitTemp[langKey].Add(term, new List<StringProbabEntry>());
                            }
                            if (!existingTranslits[term].ContainsKey(spe.str) && spe.probab >= translEntry.threshold)
                            {
                                spe.probab = translEntry.translitBf.Get(spe.probab);
                                existingTranslits[term].Add(spe.str, true);
                                res[term].Add(spe);
                                translitTemp[langKey][term].Add(spe);
                            }
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: a line that cannot be parsed or scored is skipped.
                        skippedLines++;
                    }
                }
            }

            if (skippedLines > 0)
            {
                Log.Write("Skipped " + skippedLines.ToString() + " unusable line(s) in \"" + nBestFile + "\".", LogLevelType.WARNING);
            }
        }

        // Delete each file on its own so that a failure on one does not leave the other behind.
        foreach (string path in new string[] { nBestFile, tmpFile })
        {
            try
            {
                File.Delete(path);
            }
            catch (Exception ex)
            {
                Log.Write("Could not delete temporary file \"" + path + "\": " + ex.Message, LogLevelType.WARNING);
            }
        }
    }

    // Explicit collection after the batch, kept as in the previous implementation.
    GC.Collect();
    GC.WaitForPendingFinalizers();
    return res;
}
/// <summary>
/// Copies every entry of <paramref name="fromDict"/> whose key is missing in
/// <paramref name="toDict"/> into <paramref name="toDict"/> and stores the same list in the
/// shared <c>translitTemp</c> cache under the "srcLang_trgLang" key (an empty key when
/// <paramref name="translEntry"/> is null).
/// </summary>
/// <param name="fromDict">Source entries; nothing happens when null.</param>
/// <param name="toDict">
/// Destination. When null, a temporary dictionary is used; because the parameter is passed by
/// value the caller does not receive it, so only the cache update is observable.
/// </param>
/// <param name="translEntry">Supplies the source/target languages for the cache key.</param>
private static void CopyTranslits(Dictionary<string, List<StringProbabEntry>> fromDict, Dictionary<string, List<StringProbabEntry>> toDict, MPAlignerConfigurationTranslEntry translEntry)
{
    if (fromDict == null)
    {
        return;
    }

    string langKey = translEntry != null
        ? ((translEntry.srcLang != null ? translEntry.srcLang : "") + "_" + (translEntry.trgLang != null ? translEntry.trgLang : ""))
        : "";

    if (toDict == null)
    {
        toDict = new Dictionary<string, List<StringProbabEntry>>();
    }

    foreach (KeyValuePair<string, List<StringProbabEntry>> pair in fromDict)
    {
        if (toDict.ContainsKey(pair.Key))
        {
            continue;
        }
        toDict.Add(pair.Key, pair.Value);

        // Register the language pair lazily; indexing an unregistered key would throw.
        if (!translitTemp.ContainsKey(langKey))
        {
            translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
        }
        // The indexer both inserts and overwrites, so no separate Add is needed.
        translitTemp[langKey][pair.Key] = pair.Value;
    }
}
/// <summary>
/// Builds one <see cref="ProcessedTermEntry"/> per surface form in <paramref name="terms"/>:
/// splits it into words, attaches dictionary translations and simple character
/// transliterations per word, and finally attaches the transliteration candidates obtained from
/// <c>GetTransliterations</c> or from the shared <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Surface forms to process; a null list yields an empty result.</param>
/// <param name="srcToTrgDict">Optional word (or stem) to translation/probability map.</param>
/// <param name="lang">Language passed to the stemmer; also the cache key when <paramref name="translitEntry"/> is null.</param>
/// <param name="translitEntry">Transliteration settings; its languages form the cache key.</param>
/// <param name="mosesPath">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When true, dictionary lookups use the stem of each word instead of the word.</param>
/// <returns>The processed entries, in the order of <paramref name="terms"/>.</returns>
public static List<ProcessedTermEntry> ProcessTermsList(List<string> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1 , bool stemWords = false)
{
    List<ProcessedTermEntry> res = new List<ProcessedTermEntry>(1000);
    Dictionary<string, int> pendingWords = new Dictionary<string, int>(1000);
    if (terms == null)
    {
        return res;
    }

    string langKey;
    if (translitEntry != null)
    {
        langKey = (translitEntry.srcLang != null ? translitEntry.srcLang : "") + "_" + (translitEntry.trgLang != null ? translitEntry.trgLang : "");
    }
    else
    {
        langKey = lang;
    }
    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (string surfaceForm in terms)
    {
        string lowered = surfaceForm.ToLower();
        string[] loweredWords = whitespaceRegex.Split(lowered);

        ProcessedTermEntry pte = new ProcessedTermEntry();
        pte.surfaceForm = surfaceForm;
        pte.lowercaceForm = lowered;
        pte.surfaceFormWords = new List<string>(whitespaceRegex.Split(surfaceForm));
        pte.lowercaseWords.InsertRange(0, loweredWords);

        foreach (string word in loweredWords)
        {
            pte.len += word.Length;

            // Queue the word for transliteration unless it is already queued or cached.
            if (!pendingWords.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
            {
                pendingWords.Add(word, 0);
            }

            string stem = stemWords ? LightweightStemmer.Stem(word, lang) : null;

            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));

            if (srcToTrgDict == null)
            {
                continue;
            }

            // One (possibly empty) translation list per word keeps the lists index-aligned.
            List<StringProbabEntry> candidates = new List<StringProbabEntry>();
            string lookupKey = stemWords ? stem : word;
            Dictionary<string, double> targets;
            if (srcToTrgDict.TryGetValue(lookupKey, out targets))
            {
                foreach (KeyValuePair<string, double> target in targets)
                {
                    StringProbabEntry candidate = new StringProbabEntry();
                    candidate.str = target.Key;
                    candidate.probab = target.Value;
                    candidates.Add(candidate);
                }
            }
            pte.translationList.Add(candidates);
        }

        res.Add(pte);
    }

    Dictionary<string, List<StringProbabEntry>> translitDict = GetTransliterations(pendingWords, translitEntry, mosesPath, tempFilePath, threadCount);

    // Attach transliterations: fresh results first, then the cache, else an empty list.
    foreach (ProcessedTermEntry entry in res)
    {
        foreach (string word in entry.lowercaseWords)
        {
            List<StringProbabEntry> fresh;
            if (translitDict.TryGetValue(word, out fresh))
            {
                entry.transliterationList.Add(fresh);
            }
            else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
            {
                entry.transliterationList.Add(translitTemp[langKey][word]);
            }
            else
            {
                entry.transliterationList.Add(new List<StringProbabEntry>());
            }
        }
    }

    // Simple size cap on the cache so that it does not grow without bound.
    if (translitTemp[langKey].Count >= 50000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return res;
}
/// <summary>
/// Converts the given term entries into <see cref="ProcessedTermEntry"/> objects keyed by the
/// same (lower-cased) key: copies the annotation sequences, splits the key into words, attaches
/// dictionary translations and simple character transliterations per word, and finally attaches
/// transliteration candidates from <c>GetTransliterations</c> or the shared
/// <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Entries keyed by lower-cased term; a null dictionary yields an empty result.</param>
/// <param name="srcToTrgDict">Optional word (or stem) to translation/probability map.</param>
/// <param name="lang">Language used in the log message and by the stemmer; also the cache key when <paramref name="translitEntry"/> is null.</param>
/// <param name="translitEntry">Transliteration settings; its languages form the cache key.</param>
/// <param name="mosesPath">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Forwarded to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When true, dictionary lookups use the stem of each word instead of the word.</param>
/// <returns>Processed entries keyed like <paramref name="terms"/> (never null).</returns>
public static Dictionary<string, ProcessedTermEntry> ProcessTerms(Dictionary<string, SimpleTermEntry> terms, Dictionary<string,Dictionary<string,double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1 , bool stemWords = false)
{
    Dictionary<string, ProcessedTermEntry> res = new Dictionary<string, ProcessedTermEntry>(1000);
    Dictionary<string, int> lowercasedWordDict = new Dictionary<string, int>(1000);
    if (terms == null)
    {
        return res;
    }

    Log.Write ("Starting pre-processing of "+terms.Count.ToString()+" "+ lang +" terms.",LogLevelType.LIMITED_OUTPUT);

    string langKey = translitEntry != null
        ? ((translitEntry.srcLang != null ? translitEntry.srcLang : "") + "_" + (translitEntry.trgLang != null ? translitEntry.trgLang : ""))
        : lang;
    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (string lowerCase in terms.Keys)
    {
        SimpleTermEntry source = terms[lowerCase];

        // Only the first capitalization of a surface form is kept; later ones are ignored.
        if (res.ContainsKey(lowerCase))
        {
            continue;
        }

        ProcessedTermEntry pte = new ProcessedTermEntry();
        pte.surfaceForm = source.term;
        pte.concordance = !string.IsNullOrWhiteSpace(source.conc) ? source.conc : "";
        pte.normMsdSeq = !string.IsNullOrWhiteSpace(source.normMsdSeq)
            ? new List<string>(whitespaceRegex.Split(source.normMsdSeq))
            : new List<string>();
        pte.normSeq = !string.IsNullOrWhiteSpace(source.normSeq)
            ? new List<string>(whitespaceRegex.Split(source.normSeq))
            : new List<string>();
        pte.lowercaceForm = lowerCase;
        pte.surfaceFormWords = !string.IsNullOrWhiteSpace(source.term)
            ? new List<string>(whitespaceRegex.Split(source.term))
            : new List<string>();

        // Use an empty array (not null) for a blank key so that the word loop below is simply
        // skipped instead of failing on a null enumeration.
        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase)
            ? whitespaceRegex.Split(lowerCase)
            : new string[0];
        pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

        // Missing lemma/MSD annotations are padded with one empty string per word.
        if (!string.IsNullOrWhiteSpace(source.lemmaSeq))
        {
            pte.lemmaSeq = new List<string>(whitespaceRegex.Split(source.lemmaSeq));
        }
        else
        {
            pte.lemmaSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.lemmaSeq.Add("");
            }
        }
        if (!string.IsNullOrWhiteSpace(source.msdSeq))
        {
            pte.msdSeq = new List<string>(whitespaceRegex.Split(source.msdSeq));
        }
        else
        {
            pte.msdSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.msdSeq.Add("");
            }
        }

        foreach (string word in lowerCaseWordArr)
        {
            pte.len += word.Length;

            // Queue the word for transliteration unless it is already queued or cached.
            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
            {
                lowercasedWordDict.Add(word, 0);
            }

            string stem = null;
            if (stemWords)
            {
                stem = LightweightStemmer.Stem(word, lang);
            }

            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));

            if (srcToTrgDict != null)
            {
                // One (possibly empty) translation list per word keeps the lists index-aligned.
                List<StringProbabEntry> currList = new List<StringProbabEntry>();
                string lookupKey = stemWords ? stem : word;
                Dictionary<string, double> targets;
                // A null key (no stem available) is treated as "no translations".
                if (lookupKey != null && srcToTrgDict.TryGetValue(lookupKey, out targets) && targets != null)
                {
                    foreach (KeyValuePair<string, double> target in targets)
                    {
                        StringProbabEntry spe = new StringProbabEntry();
                        spe.str = target.Key;
                        spe.probab = target.Value;
                        currList.Add(spe);
                    }
                }
                pte.translationList.Add(currList);
            }
        }

        res.Add(lowerCase, pte);
    }

    Dictionary<string, List<StringProbabEntry>> translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);

    // Attach transliterations: fresh results first, then the cache, else an empty list.
    foreach (ProcessedTermEntry entry in res.Values)
    {
        foreach (string word in entry.lowercaseWords)
        {
            List<StringProbabEntry> fresh;
            if (translitDict.TryGetValue(word, out fresh))
            {
                entry.transliterationList.Add(fresh);
            }
            else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
            {
                entry.transliterationList.Add(translitTemp[langKey][word]);
            }
            else
            {
                entry.transliterationList.Add(new List<StringProbabEntry>());
            }
        }
    }

    // Simple size cap on the cache so that it does not grow without bound.
    if (translitTemp[langKey].Count >= 25000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return res;
}
/// <summary>
/// Selects the transliteration configurations for a language pair from
/// <c>configuration.translConfEntryDict</c>.
/// </summary>
/// <remarks>
/// Order of attempts: (1) when <c>forceEnTranslitInterlingua</c> is set and both
/// "&lt;lang&gt;_en" entries exist and are enabled, those two are selected; (2) otherwise the
/// direct "src_trg" and "trg_src" entries are tried independently; (3) when neither direct entry
/// is usable, the "&lt;lang&gt;_en" entries are tried again, each on its own.
/// </remarks>
/// <param name="configuration">Configuration holding the entries and the force flag; also passed to the logger.</param>
/// <param name="srcLang">Source language code.</param>
/// <param name="trgLang">Target language code.</param>
/// <param name="srcTranslitConf">Receives the "src_en" entry, or null.</param>
/// <param name="trgTranslitConf">Receives the "trg_en" entry, or null.</param>
/// <param name="srcToTrgTranslitConf">Receives the "src_trg" entry, or null.</param>
/// <param name="trgToSrcTranslitConf">Receives the "trg_src" entry, or null.</param>
/// <returns>
/// True when EN interlingua entries were selected (both in the forced case, at least one in the
/// fallback case); false otherwise, including when a direct entry was found.
/// </returns>
public static bool GetTranslitConfig(MPAlignerConfiguration configuration, string srcLang, string trgLang, out MPAlignerConfigurationTranslEntry srcTranslitConf, out MPAlignerConfigurationTranslEntry trgTranslitConf, out MPAlignerConfigurationTranslEntry srcToTrgTranslitConf, out MPAlignerConfigurationTranslEntry trgToSrcTranslitConf)
{
    Log.Write ("Searching for transliteration configurations.",LogLevelType.LIMITED_OUTPUT,configuration);

    srcTranslitConf = null;
    trgTranslitConf = null;
    srcToTrgTranslitConf = null;
    trgToSrcTranslitConf = null;

    string srcEnKey = srcLang + "_en";
    string trgEnKey = trgLang + "_en";
    string forwardKey = srcLang + "_" + trgLang;
    string reverseKey = trgLang + "_" + srcLang;

    // Step 1: forced EN interlingua - requires both "<lang>_en" entries to exist and be enabled.
    if (configuration.forceEnTranslitInterlingua)
    {
        bool bothEnabled = configuration.translConfEntryDict.ContainsKey(srcEnKey)
            && configuration.translConfEntryDict.ContainsKey(trgEnKey)
            && configuration.translConfEntryDict[srcEnKey].use
            && configuration.translConfEntryDict[trgEnKey].use;
        if (bothEnabled)
        {
            srcTranslitConf = configuration.translConfEntryDict[srcEnKey];
            trgTranslitConf = configuration.translConfEntryDict[trgEnKey];
            Log.Write ("EN interlingua transliteration loaded for language "+srcLang+": "+ srcTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT,configuration);
            Log.Write ("EN interlingua transliteration loaded for language "+trgLang+": "+ trgTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT,configuration);
            return true;
        }

        Log.Write ("Cannot force EN interlingua transliteration for the pair "+forwardKey+" as at least one of the interlingua transliteration configurations is disabled or missing!",LogLevelType.WARNING,configuration);
        Log.Write ("Will try falling back to direct transliteration without the EN interlingua.",LogLevelType.WARNING,configuration);
    }

    // Step 2: direct transliteration, each direction on its own.
    bool directFound = false;

    if (!configuration.translConfEntryDict.ContainsKey(forwardKey) || !configuration.translConfEntryDict[forwardKey].use)
    {
        Log.Write ("Direct transliteration for the pair "+forwardKey+" was not found.",LogLevelType.WARNING,configuration);
    }
    else
    {
        srcToTrgTranslitConf = configuration.translConfEntryDict[forwardKey];
        Log.Write ("Transliteration loaded for language "+srcLang+" into language "+trgLang+": "+ srcToTrgTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT,configuration);
        directFound = true;
    }

    if (!configuration.translConfEntryDict.ContainsKey(reverseKey) || !configuration.translConfEntryDict[reverseKey].use)
    {
        Log.Write ("Direct transliteration for the pair "+reverseKey+" was not found.",LogLevelType.WARNING,configuration);
    }
    else
    {
        trgToSrcTranslitConf = configuration.translConfEntryDict[reverseKey];
        Log.Write ("Transliteration loaded for language "+trgLang+" into language "+srcLang+": "+ trgToSrcTranslitConf.mosesIniPath, LogLevelType.LIMITED_OUTPUT,configuration);
        directFound = true;
    }

    if (directFound)
    {
        return false;
    }

    // Step 3: nothing direct - fall back to whichever EN interlingua entries are usable.
    Log.Write ("Direct transliteration for the pairs "+forwardKey+" nor "+reverseKey+" were not found. Will try falling back to interlingua transliteration.",LogLevelType.WARNING,configuration);

    bool anyInterlinguaLoaded = false;
    if (configuration.translConfEntryDict.ContainsKey(srcEnKey) && configuration.translConfEntryDict[srcEnKey].use)
    {
        srcTranslitConf = configuration.translConfEntryDict[srcEnKey];
        anyInterlinguaLoaded = true;
        Log.Write ("EN interlingua transliteration loaded for language "+srcLang+": "+ srcTranslitConf.mosesIniPath,LogLevelType.LIMITED_OUTPUT,configuration);
    }
    if (configuration.translConfEntryDict.ContainsKey(trgEnKey) && configuration.translConfEntryDict[trgEnKey].use)
    {
        trgTranslitConf = configuration.translConfEntryDict[trgEnKey];
        anyInterlinguaLoaded = true;
        Log.Write ("EN interlingua transliteration loaded for language "+trgLang+": "+ trgTranslitConf.mosesIniPath,LogLevelType.LIMITED_OUTPUT,configuration);
    }

    if (anyInterlinguaLoaded)
    {
        return true;
    }

    if (configuration.forceEnTranslitInterlingua)
    {
        Log.Write ("Cannot force EN interlingua transliteration for the pair "+forwardKey+" as at least one of the interlingua transliteration configurations is disabled or missing!",LogLevelType.WARNING,configuration);
        Log.Write ("The system will be executed without transliteration.",LogLevelType.WARNING,configuration);
    }

    return false;
}
public static void Main(string[] args) { string configFile = null; string method = null; string inputFile = null; string inputFormat = "tagged_plaintext";//Allowed values: tagged_plaintext, preprocessed_terms, term_list string srcInputFile = null; string trgInputFile = null; string srcLang = null; string trgLang = null; string outputFile = null; string consolidatedOutputFile = null; string outputFormat = "";//"tabsep";//Allowed values: ref_tabsep, tabsep, xml string preProcessedTermOutputFile = null;//"/home/marcis/Dropbox/MonoProjects/MPAligner/MPAligner/bin/Debug/testTermData.xml";//null; string tempTranslitFile = null; bool consolidateResults = false; double consolidationThreshold = 0; //bool logPrepData = false; string domainId = ""; string collectionId = ""; //The skipping parameters are just for debugging. Use them only manually! string skipSrc = ""; string skipTrg = ""; MPAlignerConfiguration configuration = null; //Read all configuration parameters from the command line. for (int i=0; i<args.Length; i++) { if ((args [i] == "-c" || args [i] == "--configuration") && args.Length > i + 1) { configFile = args [i + 1]; configuration = new MPAlignerConfiguration (); configuration.Load (configFile); } else if ((args [i] == "-m" || args [i] == "--method") && args.Length > i + 1) { method = args [i + 1]; } else if ((args [i] == "-i" || args [i] == "--input-file") && args.Length > i + 1) { inputFile = args [i + 1]; //} else if (args [i] == "-lp" || args [i] == "--log-pre-processed") { // logPrepData = true; } else if ((args [i] == "-if" || args [i] == "--input-format") && args.Length > i + 1) { inputFormat = args [i + 1]; } else if ((args [i] == "-si" || args [i] == "--source-input") && args.Length > i + 1) { srcInputFile = args [i + 1]; } else if ((args [i] == "-ti" || args [i] == "--target-input") && args.Length > i + 1) { trgInputFile = args [i + 1]; } else if ((args [i] == "-sl" || args [i] == "--source-language") && args.Length > i + 1) { srcLang = 
MPFramework.MPFrameworkFunctions.GetValidLangString (args [i + 1]); } else if ((args [i] == "-tl" || args [i] == "--target-language") && args.Length > i + 1) { trgLang = MPFramework.MPFrameworkFunctions.GetValidLangString (args [i + 1]); } else if ((args [i] == "-o" || args [i] == "--output-file") && args.Length > i + 1) { outputFile = args [i + 1]; } else if ((args [i] == "-of" || args [i] == "--output-format") && args.Length > i + 1) { outputFormat = args [i + 1]; } else if ((args [i] == "-pto" || args [i] == "--pre-processed-term-output-file") && args.Length > i + 1) { preProcessedTermOutputFile = args [i + 1]; } else if ((args [i] == "-ttf" || args [i] == "--temp-translit-file") && args.Length > i + 1) { tempTranslitFile = args [i + 1]; } else if ((args [i] == "-ss" || args [i] == "--skip-source-file") && args.Length > i + 1) { skipSrc = args [i + 1]; } else if ((args [i] == "-st" || args [i] == "--skip-target-file") && args.Length > i + 1) { skipTrg = args [i + 1]; } else if ((args [i] == "-d_id" || args [i] == "--domain-id") && args.Length > i + 1) { domainId = args [i + 1]; } else if ((args [i] == "-c_id" || args [i] == "--collection-id") && args.Length > i + 1) { collectionId = args [i + 1]; } else if ((args [i] == "-ct" || args [i] == "--consolidation-threshold") && args.Length > i + 1) { //Consolidation works only if the ref_tabsep output format is specified! NumberFormatInfo nfi = new NumberFormatInfo (); nfi.CurrencyDecimalSeparator = "."; nfi.NumberDecimalSeparator = "."; nfi.PercentDecimalSeparator = "."; consolidationThreshold = Convert.ToDouble (args [i + 1], nfi); consolidateResults = true; } } //Break if a method is not defined. if (string.IsNullOrWhiteSpace (method)) { Log.Write ("Method not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } //Write a configuration file to the output file if the config method is specified. 
if (method.ToLower () == "config") { if (string.IsNullOrWhiteSpace (outputFile)) { Log.Write("Output file not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } MPAlignerConfiguration conf = new MPAlignerConfiguration (); MPAlignerConfigurationDictEntry cde = new MPAlignerConfigurationDictEntry (); cde.srcLang = "lv"; cde.trgLang = "en"; cde.path = "/home/marcis/TILDE/RESOURCES/DICT/lv_en_noisy"; conf.dictConfEntryDict.Add ("lv_en", cde); cde = new MPAlignerConfigurationDictEntry (); cde.srcLang = "lt"; cde.trgLang = "en"; cde.path = "/home/marcis/TILDE/RESOURCES/DICT/lt_en"; conf.dictConfEntryDict.Add ("lt_en", cde); MPAlignerConfigurationTranslEntry cte = new MPAlignerConfigurationTranslEntry (); cte.mosesIniPath = "/home/marcis/TILDE/RESOURCES/TRANSLIT_WORKING_DIR/LV-EN/lv-en-binarised-model.moses.ini"; cte.srcLang = "lv"; cte.trgLang = "en"; conf.translConfEntryDict.Add ("lv_en", cte); cte = new MPAlignerConfigurationTranslEntry (); cte.mosesIniPath = "/home/marcis/TILDE/RESOURCES/TRANSLIT_WORKING_DIR/LV-EN/lt-en-binarised-model.moses.ini"; cte.srcLang = "lt"; cte.trgLang = "en"; conf.translConfEntryDict.Add ("lt_en", cte); MPAlignerConfigurationLangPairEntry lpe = new MPAlignerConfigurationLangPairEntry (); lpe.srcLang = "lv"; lpe.trgLang = "en"; conf.langPairEntryDict.Add ("lv_en", lpe); lpe = new MPAlignerConfigurationLangPairEntry (); lpe.srcLang = "lt"; lpe.trgLang = "en"; conf.langPairEntryDict.Add ("lt_en", lpe); MPAlignerConfigurationExceptionEntry cee = new MPAlignerConfigurationExceptionEntry (); cee.srcLang = "lv"; cee.trgLang = "en"; cee.path = "/home/marcis/TILDE/RESOURCES/EXC_DICT/lv_en_exc"; conf.excDictEntryDict.Add ("lv_en", cee); cee = new MPAlignerConfigurationExceptionEntry (); cee.srcLang = "lt"; cee.trgLang = "en"; cee.path = "/home/marcis/TILDE/RESOURCES/EXC_DICT/lt_en_exc"; conf.excDictEntryDict.Add ("lt_en", cee); MPAlignerConfigurationStopWordListEntry cswle = new MPAlignerConfigurationStopWordListEntry (); 
cswle.lang = "lv"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/lv_stop"; conf.stopWordListEntryDict.Add ("lv", cswle); cswle = new MPAlignerConfigurationStopWordListEntry (); cswle.lang = "lt"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/lt_stop"; conf.stopWordListEntryDict.Add ("lt", cswle); cswle = new MPAlignerConfigurationStopWordListEntry (); cswle.lang = "en"; cswle.path = "/home/marcis/TILDE/RESOURCES/STOP_WORD/en_stop"; conf.stopWordListEntryDict.Add ("en", cswle); conf.Save (outputFile); return; } //Try reading the default configuration if none is passed, but if the default configuration can not be found, break. if (string.IsNullOrWhiteSpace (configFile) && File.Exists ("MPAlignerConfig.xml")) { configuration = new MPAlignerConfiguration (); configuration.Load (configFile); } else if (string.IsNullOrWhiteSpace (configFile)) { Log.Write("Configuration file missing in application directory and a substitution runtime configuration file is not specified!",LogLevelType.ERROR,configuration); PrintUsage (); return; } //In the case if an output format is not defined in the command line, read it from the configuration file. if (string.IsNullOrWhiteSpace (outputFormat)) outputFormat = configuration.outputFormat; //In the case if the configuration does not specify an output format, use the default output format. 
if (string.IsNullOrWhiteSpace (outputFormat)) { outputFormat = "ref_tabsep"; } Log.confLogLevel = configuration.logLevel; if (string.IsNullOrWhiteSpace (tempTranslitFile)) { tempTranslitFile = outputFile+".tmp"; } Log.Write ("configFile: "+(configFile!=null?configFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("method: "+(method!=null?method:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("inputFile: "+(inputFile!=null?inputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("inputFormat: "+(inputFormat!=null?inputFormat:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("srcInputFile: "+(srcInputFile!=null?srcInputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("trgInputFile: "+(trgInputFile!=null?trgInputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("srcLang: "+(srcLang!=null?srcLang:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("trgLang: "+(trgLang!=null?trgLang:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("outputFile: "+(outputFile!=null?outputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("outputFormat: "+(outputFormat!=null?outputFormat:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("preProcessedTermOutputFile: "+(preProcessedTermOutputFile!=null?preProcessedTermOutputFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("tempTranslitFile: "+(tempTranslitFile!=null?tempTranslitFile:""),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("consolidation threshold: "+(consolidateResults?consolidationThreshold.ToString():""),LogLevelType.LIMITED_OUTPUT,configuration); if (outputFormat == "ref_tabsep" && consolidateResults) { consolidatedOutputFile = outputFile; outputFile += ".raw"; } //For document pair-based alignment. 
if (method.ToLower () == "taggedfilepairs") { char[] sep = {'\t'}; if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Input file list file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } //Read the alignment thresholds and other language pair specific numerical/single-value parameters. MPAlignerConfigurationLangPairEntry lpeConf = ReadLangPairConfig (srcLang, trgLang, configuration); //The size of the cache may affect the performance of the alignment! Dictionary<string, ProcessedTermEntry> srcTermCache = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermCache = new Dictionary<string, ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; //Define dictionaries for pre-processing. Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; //Define transliteration configurations for pre-processing. MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; //Read dictionaries and transliterations. 
interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); //Define the alignments (the variable holding alignment results) Dictionary<string,Dictionary<string, AlignmentInfoElement>> alignments = new Dictionary<string, Dictionary<string, AlignmentInfoElement>>(); //Define and read exception dictionaries. Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); //Define and read stopword lists. Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); StreamReader sr = new StreamReader(inputFile,Encoding.UTF8); int pairCounter = 0; bool skip = !string.IsNullOrWhiteSpace(skipSrc)&&!string.IsNullOrWhiteSpace(skipTrg)?true:false; //Read input document alignment file and process file pairs. while(!sr.EndOfStream) { pairCounter++; string line = sr.ReadLine().Trim(); if (string.IsNullOrWhiteSpace(line)) continue; string[] arr = line.Split(sep, StringSplitOptions.RemoveEmptyEntries); if (arr.Length<2) { continue; //If the alignment line does not contain at least two entries, the document alignment is not valid. } string srcFile = arr[0]; string trgFile = arr[1]; if (!File.Exists(srcFile)) { Log.Write("Input file \""+srcFile+"\" cannot be found!",LogLevelType.WARNING,configuration); continue; } if (!File.Exists(trgFile)) { Log.Write("Input file \""+trgFile+"\" cannot be found!",LogLevelType.WARNING,configuration); continue; } string srcFileName = Path.GetFileName(srcFile); string trgFileName = Path.GetFileName(trgFile); //The skipping condition is for debugging - if the system crashes due to insufficient memory... 
if (skip) { if (srcFileName==skipSrc&&trgFileName == skipTrg) { skip = false; } else { Log.Write("Skipping file pair "+srcFileName+" and " + trgFileName+".",LogLevelType.WARNING,configuration); continue; } } Log.Write("Processing file pair "+srcFileName+" and " + trgFileName+".",LogLevelType.LIMITED_OUTPUT,configuration); //Define term entry data variables (used for sotring terms in pre-pre-processed and pre-processed states). Dictionary<string,SimpleTermEntry> srcInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> srcInitialTempList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialTempList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string, ProcessedTermEntry> srcTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> srcTermTempList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermTempList = new Dictionary<string, ProcessedTermEntry>(); //Two input formats are currently supported - term-tagged plaintext files and term list (one term per line) files. if (inputFormat=="tagged_plaintext") { //Read terms from the term-tagged documents. srcInitialTempList = TermTaggedFileParser.ParseTermTaggedFile(srcFile,Encoding.UTF8, configuration.concLen); trgInitialTempList = TermTaggedFileParser.ParseTermTaggedFile(trgFile,Encoding.UTF8, configuration.concLen); } else { //Read terms from the term list files. srcInitialTempList = ListFileParser.Parse(srcFile,Encoding.UTF8); trgInitialTempList = ListFileParser.Parse(trgFile,Encoding.UTF8); } //Search for already pre-processed source terms in the cache. 
foreach(string term in srcInitialTempList.Keys) { string lower = term.ToLower(); if (srcTermCache.ContainsKey(lower)) { if (!srcTermList.ContainsKey(lower)) srcTermList.Add(lower, srcTermCache[lower]); } else { srcInitialList.Add(term, srcInitialTempList[term]); } } //Search for already pre-processed target terms in the cache. foreach(string term in trgInitialTempList.Keys) { string lower = term.ToLower(); if (trgTermCache.ContainsKey(lower)) { if (!trgTermList.ContainsKey(lower)) trgTermList.Add(lower, trgTermCache[lower]); } else { trgInitialList.Add(term, trgInitialTempList[term]); } } //Now pre-process terms that have not been pre-processed again. if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermTempList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, 
configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermTempList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } //Update the pre-processed term list for alignment. foreach(string srcTerm in srcTermTempList.Keys) { if (!srcTermList.ContainsKey(srcTerm)) srcTermList.Add(srcTerm,srcTermTempList[srcTerm]); if (!srcTermCache.ContainsKey(srcTerm)) srcTermCache.Add(srcTerm, srcTermTempList[srcTerm]); } foreach(string trgTerm in trgTermTempList.Keys) { if (!trgTermList.ContainsKey(trgTerm)) trgTermList.Add(trgTerm,trgTermTempList[trgTerm]); if (!trgTermCache.ContainsKey(trgTerm)) trgTermCache.Add(trgTerm, trgTermTempList[trgTerm]); } //Execute alignment for one file pair. List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); //The execution may be multi-threaded or single-threaded. The multi-threaded execution may be instable. Therefore, be careful when using multi-threading. 
if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcFile, trgFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcFile, trgFile, excDict, srcStopWords, trgStopWords); } if (alignment!=null) { foreach(AlignmentInfoElement aie in alignment) { if (!alignments.ContainsKey(aie.srcEntry.lowercaceForm)) { alignments.Add(aie.srcEntry.lowercaceForm, new Dictionary<string, AlignmentInfoElement>()); } if (!alignments[aie.srcEntry.lowercaceForm].ContainsKey(aie.trgEntry.lowercaceForm)) { alignments[aie.srcEntry.lowercaceForm].Add(aie.trgEntry.lowercaceForm, aie); } } } //If pre-processed term cache is full, empty it (this maybe can be imrpoved with the help of some sort of a flowing cache (always circulating). if (srcTermCache.Count>50000) { srcTermCache.Clear(); srcTermCache = new Dictionary<string, ProcessedTermEntry>(); GC.Collect(); GC.WaitForPendingFinalizers(); } if (trgTermCache.Count>50000) { trgTermCache.Clear(); trgTermCache = new Dictionary<string, ProcessedTermEntry>(); GC.Collect(); GC.WaitForPendingFinalizers(); } //After each 50 pairs, print rsults. 
if (pairCounter%50==0||alignments.Count>50000) { Log.Write("Printing intermediate results after "+pairCounter.ToString()+" file pairs",LogLevelType.LIMITED_OUTPUT,configuration); List<AlignmentInfoElement> resAlignment = new List<AlignmentInfoElement>(); foreach(string src in alignments.Keys) { foreach(string trg in alignments[src].Keys) { resAlignment.Add(alignments[src][trg]); } } AlignmentInfoElement.AppendList(outputFormat,outputFile,resAlignment,lpeConf,srcLang,trgLang,collectionId,domainId); alignments.Clear(); alignments = new Dictionary<string, Dictionary<string, AlignmentInfoElement>>(); GC.Collect(); GC.WaitForPendingFinalizers(); } } sr.Close(); //If there are alignments left, write them to the output file. if (!string.IsNullOrWhiteSpace(outputFile)) { Log.Write("Printing final results after "+pairCounter.ToString()+" file pairs",LogLevelType.LIMITED_OUTPUT,configuration); List<AlignmentInfoElement> resAlignment = new List<AlignmentInfoElement>(); foreach(string src in alignments.Keys) { foreach(string trg in alignments[src].Keys) { resAlignment.Add(alignments[src][trg]); } } AlignmentInfoElement.AppendList(outputFormat,outputFile,resAlignment,lpeConf,srcLang,trgLang,collectionId,domainId); } } else if (method.ToLower () == "singletaggedpair") //TODO: REFACTOR (the file pair list processing could be handled (wisely) through a single file pair processing method!!! { //Define the instances of source and target processed term lists. 
Dictionary<string, ProcessedTermEntry> srcTermList = new Dictionary<string, ProcessedTermEntry>(); Dictionary<string, ProcessedTermEntry> trgTermList = new Dictionary<string, ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; if (inputFormat=="preprocessed_terms") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Pre-processed term input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } PreprocessedTermData ptd = PreprocessedTermData.ReadFromFile(inputFile); foreach(ProcessedTermEntry pte in ptd.srcTerms) { if(!srcTermList.ContainsKey(pte.lowercaceForm)) { srcTermList.Add(pte.lowercaceForm,pte); } } foreach(ProcessedTermEntry pte in ptd.trgTerms) { if(!trgTermList.ContainsKey(pte.lowercaceForm)) { trgTermList.Add(pte.lowercaceForm,pte); } } srcLang = ptd.srcLang; trgLang = ptd.trgLang; interlinguaDictUsed = ptd.interlinguaDictUsed; interlinguaTranslitUsed = ptd.interlinguaTranslitUsed; Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = 
Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else if (inputFormat=="term_list"||inputFormat=="tagged_plaintext") { if (string.IsNullOrWhiteSpace(srcInputFile)||!File.Exists(srcInputFile)||string.IsNullOrWhiteSpace(trgInputFile)||!File.Exists(trgInputFile)) { Log.Write("Source and/or target files not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } Dictionary<string,SimpleTermEntry> srcInitialList = new Dictionary<string, SimpleTermEntry>(); Dictionary<string,SimpleTermEntry> trgInitialList = new Dictionary<string, SimpleTermEntry>(); if (inputFormat=="tagged_plaintext") { srcInitialList = TermTaggedFileParser.ParseTermTaggedFile(srcInputFile,Encoding.UTF8, configuration.concLen); trgInitialList = TermTaggedFileParser.ParseTermTaggedFile(trgInputFile,Encoding.UTF8, configuration.concLen); } else { srcInitialList = ListFileParser.Parse(srcInputFile,Encoding.UTF8); trgInitialList = ListFileParser.Parse(trgInputFile,Encoding.UTF8); } Log.Write ("Unprocessed source terms: "+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry 
srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, 
configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. /// This functionality is not available for the file pair list-based processing. if (!string.IsNullOrWhiteSpace(preProcessedTermOutputFile)) { List<ProcessedTermEntry> srcTerms = new List<ProcessedTermEntry>(srcTermList.Values); List<ProcessedTermEntry> trgTerms = new List<ProcessedTermEntry>(trgTermList.Values); PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTerms.ToArray(); ptd.trgTerms = trgTerms.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preProcessedTermOutputFile,outStr); } Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, 
srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else { Log.Write ("Input format UNKNOWN or UNDEFINED.",LogLevelType.ERROR,configuration); return; } } else if (method.ToLower () == "singletermpairlist") //Use this method only if filtering of term pairs or some sort of evaluation is necessary! { //Define the instances of source and target processed term lists. List<ProcessedTermEntry> srcTermList = new List<ProcessedTermEntry>(); List<ProcessedTermEntry> trgTermList = new List<ProcessedTermEntry>(); bool interlinguaDictUsed = false; bool interlinguaTranslitUsed = false; if (inputFormat=="preprocessed_terms") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Pre-processed term input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } PreprocessedTermData ptd = PreprocessedTermData.ReadFromFile(inputFile); srcTermList.AddRange(ptd.srcTerms); trgTermList.AddRange(ptd.trgTerms); Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); srcLang = ptd.srcLang; trgLang = ptd.trgLang; interlinguaDictUsed = ptd.interlinguaDictUsed; interlinguaTranslitUsed = ptd.interlinguaTranslitUsed; Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Log.Write ("Exception dictionary entries: "+excDict.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string,bool> srcStopWords = null; 
ReadStopwordList(configuration,srcLang,out srcStopWords); Log.Write ("Source language stopwords: "+srcStopWords.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); Log.Write ("Target language stopwords: "+trgStopWords.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = Alignment.AlignListPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); Log.Write ("Alignment elements after alignment: "+alignment.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } else { if (string.IsNullOrWhiteSpace(srcInputFile)||!File.Exists(srcInputFile)||string.IsNullOrWhiteSpace(trgInputFile)||!File.Exists(trgInputFile)) { Log.Write("Source and/or target files not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source and/or target languages not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } List<string> srcInitialList = new List<string>(); List<string> trgInitialList = new List<string>(); srcInitialList = ListFileParser.ParseList(srcInputFile,Encoding.UTF8); trgInitialList = ListFileParser.ParseList(trgInputFile,Encoding.UTF8); if (srcInitialList.Count!=trgInitialList.Count) { Log.Write("Source and target term lists are with different lengths",LogLevelType.ERROR,configuration); throw new ArgumentException("Source and target term lists are with different lengths"); } Log.Write ("Unprocessed source terms: 
"+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, 
tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { srcTermList = ProcessedTermEntry.ProcessTermsList(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); trgTermList = ProcessedTermEntry.ProcessTermsList(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. if (!string.IsNullOrWhiteSpace(preProcessedTermOutputFile)) { PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTermList.ToArray(); ptd.trgTerms = trgTermList.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preProcessedTermOutputFile,outStr); } Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); if (!string.IsNullOrWhiteSpace(outputFile)) { List<AlignmentInfoElement> alignment = Alignment.AlignListPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, 
excDict, srcStopWords, trgStopWords); AlignmentInfoElement.PrintList(outputFormat,outputFile,alignment, configuration.printTopTrgForSrc,null,srcLang,trgLang,collectionId,domainId); } } } else if (method.ToLower () == "eurovoceval") { if (string.IsNullOrWhiteSpace(inputFile)||!File.Exists(inputFile)) { Log.Write("Eurovoc input file not specified or cannot be found!",LogLevelType.ERROR,configuration); PrintUsage(); return; } if (string.IsNullOrWhiteSpace(srcLang)||string.IsNullOrWhiteSpace(trgLang)) { Log.Write("Source or target language not specified!",LogLevelType.ERROR,configuration); PrintUsage(); return; } configuration.allowTrimmedAlignments = false; //configuration.useMultiThreadedExecution = false; configuration.printTopTrgForSrc = true; string logFile = outputFile+".res.log"; StreamWriter sw = new StreamWriter(logFile, true, Encoding.UTF8); Dictionary<string,List<string>> eurovocDict = ReadEurovocDict(inputFile);; //List<string> langList = GetLangsFromConf(configuration); //for(int i = 0;i<langList.Count;i++) //{ //for(int j = 0;j<langList.Count;j++) //{ //if (i==j) continue; //srcLang = langList[i]; //trgLang = langList[j]; Log.Write("Processing pair "+srcLang+"_"+trgLang,LogLevelType.LIMITED_OUTPUT,configuration); if (Char.IsDigit(outputFile[outputFile.Length-1])) outputFile = outputFile.Substring(0,outputFile.Length-1); string alignmentOutputFile = outputFile+"."+srcLang+"_"+trgLang+".align.txt"; if (File.Exists(alignmentOutputFile)) { Log.Write("Pair "+srcLang+"_"+trgLang+" already processed! 
Evaluating...",LogLevelType.LIMITED_OUTPUT,configuration); List<StringComparisonElement> terms = new List<StringComparisonElement>(); StreamReader sr = new StreamReader(alignmentOutputFile,Encoding.UTF8); char[] sep = {'\t'}; NumberFormatInfo nfi = new NumberFormatInfo(); nfi.CurrencyDecimalSeparator="."; nfi.NumberDecimalSeparator="."; nfi.PercentDecimalSeparator="."; while(!sr.EndOfStream) { string line = sr.ReadLine().Trim(); string[] arr = line.Split(sep,StringSplitOptions.None); if (arr.Length>=3) { StringComparisonElement sce = new StringComparisonElement(); sce.src = arr[0]; sce.trg = arr[1]; sce.similarity = Convert.ToDouble(arr[2],nfi); terms.Add(sce); } } sr.Close(); terms.Sort(); List<double> scores = new List<double>(); double tmp = 0; while (tmp<=1) { scores.Add(tmp); tmp+=0.01; } List<double> correct = new List<double>(); for(int t=0;t<scores.Count;t++) { correct.Add(0); } List<double> total = new List<double>(); for(int t=0;t<scores.Count;t++) { total.Add(0); } int totalForRec = 0; Dictionary<string,Dictionary<string,bool>> goldList = new Dictionary<string, Dictionary<string, bool>>(); for (int s = 0;s<eurovocDict[srcLang].Count;s++) { if (!eurovocDict[srcLang][s].Contains("(under translation)")&&!eurovocDict[trgLang][s].Contains("(under translation)")) { totalForRec++; if (!goldList.ContainsKey(eurovocDict[srcLang][s].ToLower())) goldList.Add(eurovocDict[srcLang][s].ToLower(), new Dictionary<string,bool>()); if (!goldList[eurovocDict[srcLang][s].ToLower()].ContainsKey(eurovocDict[trgLang][s].ToLower())) goldList[eurovocDict[srcLang][s].ToLower()].Add(eurovocDict[trgLang][s].ToLower(),true); } } string previousSrc = null; foreach(StringComparisonElement sce in terms) { string currSrc = sce.src; if (previousSrc!=currSrc.ToLower()) { string src = sce.src.ToLower(); string trg = sce.trg.ToLower(); double alignScore = sce.similarity; bool corr = false; if (goldList.ContainsKey(src)&&goldList[src].ContainsKey(trg)) corr = true; for (int s 
=0;s<scores.Count;s++) { if (scores[s]<=alignScore) { if (corr) correct[s]++; total[s]++; } } previousSrc = currSrc.ToLower(); } } for(int s=0;s<scores.Count;s++) { double corr = correct[s]; double tot = total[s]; double totCorr = totalForRec; double prec = corr/tot*100; double rec = corr/totCorr*100; double f1 = prec*rec*2/(prec+rec); sw.WriteLine(srcLang+"\t"+trgLang+"\t"+scores[s].ToString()+"\t"+corr.ToString()+"\t"+tot.ToString()+"\t"+totCorr.ToString()+"\t"+prec.ToString()+"\t"+rec.ToString()+"\t"+f1.ToString()); } sw.Flush(); //} //} sw.Close(); //continue; return; } string preprocessedOutputFile = outputFile+"."+srcLang+"_"+trgLang+".prep.txt"; Dictionary<string,SimpleTermEntry> srcInitialList = StringListToDict(eurovocDict[srcLang]); Dictionary<string,SimpleTermEntry> trgInitialList = StringListToDict(eurovocDict[trgLang]); Log.Write ("Unprocessed source terms: "+srcInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Unprocessed target terms: "+trgInitialList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Dictionary<string, Dictionary<string, double>> srcDict = null; Dictionary<string, Dictionary<string, double>> trgDict = null; Dictionary<string, Dictionary<string, double>> srcToTrgDict = null; Dictionary<string, Dictionary<string, double>> trgToSrcDict = null; MPAlignerConfigurationTranslEntry srcTranslitConf = null; MPAlignerConfigurationTranslEntry trgTranslitConf = null; MPAlignerConfigurationTranslEntry srcToTrgTranslitConf = null; MPAlignerConfigurationTranslEntry trgToSrcTranslitConf = null; bool interlinguaDictUsed = ReadDictionaries(configuration,srcLang,trgLang, out srcDict, out trgDict, out srcToTrgDict, out trgToSrcDict); bool interlinguaTranslitUsed = GetTranslitConfig(configuration,srcLang,trgLang,out srcTranslitConf,out trgTranslitConf,out srcToTrgTranslitConf, out trgToSrcTranslitConf); Dictionary<string,ProcessedTermEntry> srcTermList = new Dictionary<string,ProcessedTermEntry>(); 
Dictionary<string,ProcessedTermEntry> trgTermList = new Dictionary<string,ProcessedTermEntry>(); if (File.Exists(preprocessedOutputFile)) { Log.Write("Preprocessed term data found! Reading pre-processed data to save time!", LogLevelType.WARNING,configuration); PreprocessedTermData ptd1 = PreprocessedTermData.ReadFromFile(preprocessedOutputFile); interlinguaDictUsed = ptd1.interlinguaDictUsed; interlinguaTranslitUsed = ptd1.interlinguaTranslitUsed; foreach(ProcessedTermEntry pte in ptd1.srcTerms) { if (!srcTermList.ContainsKey(pte.lowercaceForm)) { srcTermList.Add(pte.lowercaceForm,pte); } } foreach(ProcessedTermEntry pte in ptd1.trgTerms) { if (!trgTermList.ContainsKey(pte.lowercaceForm)) { trgTermList.Add(pte.lowercaceForm,pte); } } } else if (interlinguaDictUsed&&interlinguaTranslitUsed) { string dir = Path.GetDirectoryName(preprocessedOutputFile); if (!dir.EndsWith(Path.DirectorySeparatorChar.ToString())) dir+=Path.DirectorySeparatorChar.ToString(); string prepSrcToTrgFile = dir+"eurovoc_preprocessed_"+srcLang+"_en.xml"; string prepTrgToSrcFile = dir+"eurovoc_preprocessed_"+trgLang+"_en.xml"; if (File.Exists(prepSrcToTrgFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+srcLang+"_en.xml",LogLevelType.LIMITED_OUTPUT,configuration); srcTermList = ProcessedTermEntry.ReadFromFile(prepSrcToTrgFile); } if (File.Exists(prepTrgToSrcFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+trgLang+"_en.xml",LogLevelType.LIMITED_OUTPUT,configuration); trgTermList = ProcessedTermEntry.ReadFromFile(prepTrgToSrcFile); } } else if (!interlinguaDictUsed&&!interlinguaTranslitUsed) { string dir = Path.GetDirectoryName(preprocessedOutputFile); if (!dir.EndsWith(Path.DirectorySeparatorChar.ToString())) dir+=Path.DirectorySeparatorChar.ToString(); string prepSrcToTrgFile = dir+"eurovoc_preprocessed_"+srcLang+"_"+trgLang+".xml"; string prepTrgToSrcFile = dir+"eurovoc_preprocessed_"+trgLang+"_"+srcLang+".xml"; if (File.Exists(prepSrcToTrgFile)) { 
Log.Write ("Reading processed term list: eurovoc_preprocessed_"+srcLang+"_"+trgLang+".xml",LogLevelType.LIMITED_OUTPUT,configuration); srcTermList = ProcessedTermEntry.ReadFromFile(prepSrcToTrgFile); } if (File.Exists(prepTrgToSrcFile)) { Log.Write ("Reading processed term list: eurovoc_preprocessed_"+trgLang+"_"+srcLang+".xml",LogLevelType.LIMITED_OUTPUT,configuration); trgTermList = ProcessedTermEntry.ReadFromFile(prepTrgToSrcFile); } } if (srcDict!=null||trgDict!=null) { if (srcTranslitConf!=null && trgTranslitConf!=null) { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } else { if (srcTranslitConf!=null && trgTranslitConf!=null) { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } else { if (srcTermList.Count<1) srcTermList = ProcessedTermEntry.ProcessTerms(srcInitialList,srcToTrgDict,srcLang,srcToTrgTranslitConf, configuration.mosesPath, 
tempTranslitFile,configuration.alignmentThreads); if (trgTermList.Count<1) trgTermList = ProcessedTermEntry.ProcessTerms(trgInitialList,trgToSrcDict,trgLang,trgToSrcTranslitConf, configuration.mosesPath, tempTranslitFile,configuration.alignmentThreads); } } Log.Write ("Pre-processed source terms: "+srcTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); Log.Write ("Pre-processed target terms: "+trgTermList.Count.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); ///If pre-processed terms should be saved for future use an output format is created. List<ProcessedTermEntry> srcTerms = new List<ProcessedTermEntry>(srcTermList.Values); List<ProcessedTermEntry> trgTerms = new List<ProcessedTermEntry>(trgTermList.Values); PreprocessedTermData ptd = new PreprocessedTermData(); ptd.interlinguaDictUsed = interlinguaDictUsed; ptd.interlinguaTranslitUsed = interlinguaTranslitUsed; ptd.srcTerms = srcTerms.ToArray(); ptd.trgTerms = trgTerms.ToArray(); ptd.srcLang = srcLang; ptd.trgLang = trgLang; string outStr = MPFramework.MPFrameworkFunctions.SerializeObjectInstance<PreprocessedTermData>(ptd); File.WriteAllText(preprocessedOutputFile,outStr); Dictionary<string, Dictionary<string, bool>> excDict = null; ReadExceptionDictionary(configuration,srcLang, trgLang,out excDict); Dictionary<string,bool> srcStopWords = null; ReadStopwordList(configuration,srcLang,out srcStopWords); Dictionary<string,bool> trgStopWords = null; ReadStopwordList(configuration,trgLang,out trgStopWords); //Need to pre-set the alignment thresholds, otherwise these will be overriden by defaults. 
MPAlignerConfigurationLangPairEntry lpeConf = ReadLangPairConfig (srcLang, trgLang, configuration); List<AlignmentInfoElement> alignment = new List<AlignmentInfoElement>(); if (configuration.useMultiThreadedExecution) { alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } else { alignment = Alignment.AlignPairs(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); } //Multi-threaded execution is not stable at the moment... //List<AlignmentInfoElement> alignment = Alignment.AlignPairsMultiThreaded(configuration,srcTermList,trgTermList,interlinguaDictUsed,interlinguaTranslitUsed,srcLang,trgLang, srcInputFile, trgInputFile, excDict, srcStopWords, trgStopWords); AlignmentInfoElement.PrintList(outputFormat, alignmentOutputFile, alignment, configuration.printTopTrgForSrc, lpeConf,srcLang,trgLang,collectionId,domainId); { List<double> scores = new List<double>(); double tmp = 0; while (tmp<=1) { scores.Add(tmp); tmp+=0.01; } List<double> correct = new List<double>(); for(int t=0;t<scores.Count;t++) { correct.Add(0); } List<double> total = new List<double>(); for(int t=0;t<scores.Count;t++) { total.Add(0); } int totalForRec = 0; Dictionary<string,Dictionary<string,bool>> goldList = new Dictionary<string, Dictionary<string, bool>>(); for (int s = 0;s<eurovocDict[srcLang].Count;s++) { if (!eurovocDict[srcLang][s].ToLower().Contains("(under translation)")&&!eurovocDict[trgLang][s].ToLower().Contains("(under translation)")) { totalForRec++; if (!goldList.ContainsKey(eurovocDict[srcLang][s].ToLower())) goldList.Add(eurovocDict[srcLang][s].ToLower(), new Dictionary<string,bool>()); if (!goldList[eurovocDict[srcLang][s].ToLower()].ContainsKey(eurovocDict[trgLang][s].ToLower())) 
goldList[eurovocDict[srcLang][s].ToLower()].Add(eurovocDict[trgLang][s].ToLower(),true); } } string previousSrc = null; alignment.Sort(); foreach(AlignmentInfoElement aie in alignment) { string currSrc = AlignmentInfoElement.GetStrFromEntry(aie.srcEntry.surfaceFormWords, aie.minSrcId, aie.maxSrcId); if (previousSrc!=currSrc.ToLower()) { string src = aie.srcEntry.surfaceForm.ToLower(); string trg = aie.trgEntry.surfaceForm.ToLower(); double alignScore = aie.alignmentScore; bool corr = false; if (goldList.ContainsKey(src)&&goldList[src].ContainsKey(trg)) corr = true; for (int s =0;s<scores.Count;s++) { if (scores[s]<=alignScore) { if (corr) correct[s]++; total[s]++; } } previousSrc = currSrc.ToLower(); } } for(int s=0;s<scores.Count;s++) { double corr = correct[s]; double tot = total[s]; double totCorr = totalForRec; double prec = corr/tot*100; double rec = corr/totCorr*100; double f1 = prec*rec*2/(prec+rec); sw.WriteLine(srcLang+"\t"+trgLang+"\t"+scores[s].ToString()+"\t"+corr.ToString()+"\t"+tot.ToString()+"\t"+totCorr.ToString()+"\t"+prec.ToString()+"\t"+rec.ToString()+"\t"+f1.ToString()); } sw.Flush(); //} //} sw.Close(); } } if (File.Exists(tempTranslitFile)) File.Delete(tempTranslitFile); if (consolidateResults) { Log.Write ("Consolidating aligned term pairs with a threshold of: "+consolidationThreshold.ToString(),LogLevelType.LIMITED_OUTPUT,configuration); //In the case if -ct (consolidation threshold) was defined and the output format has been ref_tabsep, the consolidation of results is perfomed. ConsolidationElement.ConsolidateRefTabsep(outputFile, consolidatedOutputFile,consolidationThreshold); } }