/// <summary>
/// Converts every entry of <paramref name="terms"/> into a <see cref="ProcessedTermEntry"/>:
/// the surface form, lowercase form and annotation sequences are split with
/// <c>whitespaceRegex</c>, per-word dictionary translations are collected from
/// <paramref name="srcToTrgDict"/>, and per-word transliterations are attached from
/// <c>GetTransliterations</c> or from the <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Terms keyed by their lowercase form; when <c>null</c> an empty result is returned.</param>
/// <param name="srcToTrgDict">Dictionary mapping a word (or its stem when <paramref name="stemWords"/> is set) to candidates with probabilities; when <c>null</c> no translation lists are produced.</param>
/// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
/// <param name="translitEntry">Transliteration configuration; its <c>srcLang</c> and <c>trgLang</c> form the cache key "src_trg".</param>
/// <param name="mosesPath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When <c>true</c>, dictionary lookups use <c>LightweightStemmer.Stem(word, lang)</c> instead of the word itself.</param>
/// <returns>The processed entries keyed by the same lowercase form as in <paramref name="terms"/>.</returns>
public static Dictionary<string, ProcessedTermEntry> ProcessTerms(Dictionary<string, SimpleTermEntry> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    Dictionary<string, ProcessedTermEntry> res = new Dictionary<string, ProcessedTermEntry>(1000);
    if (terms == null)
    {
        return res;
    }

    // Words that still need a transliteration (i.e. are not yet in the translitTemp cache); the value is unused.
    Dictionary<string, int> lowercasedWordDict = new Dictionary<string, int>(1000);

    Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);

    string langKey = translitEntry != null
        ? (translitEntry.srcLang ?? "") + "_" + (translitEntry.trgLang ?? "")
        : lang;
    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (KeyValuePair<string, SimpleTermEntry> termPair in terms)
    {
        string lowerCase = termPair.Key;
        SimpleTermEntry entry = termPair.Value;
        string surfaceForm = entry.term;

        // Only the first capitalization of a surface form is kept; other capitalizations are ignored.
        if (res.ContainsKey(lowerCase))
        {
            continue;
        }

        ProcessedTermEntry pte = new ProcessedTermEntry();
        pte.surfaceForm = surfaceForm;
        pte.concordance = !string.IsNullOrWhiteSpace(entry.conc) ? entry.conc : "";
        pte.normMsdSeq = !string.IsNullOrWhiteSpace(entry.normMsdSeq)
            ? new List<string>(whitespaceRegex.Split(entry.normMsdSeq))
            : new List<string>();
        pte.normSeq = !string.IsNullOrWhiteSpace(entry.normSeq)
            ? new List<string>(whitespaceRegex.Split(entry.normSeq))
            : new List<string>();
        pte.lowercaceForm = lowerCase;
        pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm)
            ? new List<string>(whitespaceRegex.Split(surfaceForm))
            : new List<string>();

        // An empty or whitespace-only key yields no words. An empty array (instead of null)
        // keeps the word loop below from throwing a NullReferenceException.
        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase)
            ? whitespaceRegex.Split(lowerCase)
            : new string[0];
        pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

        if (!string.IsNullOrWhiteSpace(entry.lemmaSeq))
        {
            pte.lemmaSeq = new List<string>(whitespaceRegex.Split(entry.lemmaSeq));
        }
        else
        {
            // No lemmas supplied: pad with one empty lemma per word.
            pte.lemmaSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.lemmaSeq.Add("");
            }
        }

        if (!string.IsNullOrWhiteSpace(entry.msdSeq))
        {
            pte.msdSeq = new List<string>(whitespaceRegex.Split(entry.msdSeq));
        }
        else
        {
            // No morpho-syntactic tags supplied: pad with one empty tag per word.
            pte.msdSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.msdSeq.Add("");
            }
        }

        foreach (string word in lowerCaseWordArr)
        {
            pte.len += word.Length;

            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
            {
                lowercasedWordDict.Add(word, 0);
            }

            // The dictionary is keyed by stems when stemming is enabled, otherwise by the word itself.
            string lookupKey = stemWords ? LightweightStemmer.Stem(word, lang) : word;

            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));

            if (srcToTrgDict != null)
            {
                List<StringProbabEntry> currList = new List<StringProbabEntry>();
                Dictionary<string, double> candidates;
                // A null key (stemmer produced nothing) is treated as "no translations".
                if (lookupKey != null && srcToTrgDict.TryGetValue(lookupKey, out candidates))
                {
                    foreach (KeyValuePair<string, double> candidate in candidates)
                    {
                        StringProbabEntry spe = new StringProbabEntry();
                        spe.str = candidate.Key;
                        spe.probab = candidate.Value;
                        currList.Add(spe);
                    }
                }
                pte.translationList.Add(currList);
            }
        }

        res.Add(lowerCase, pte);
    }

    // Only the single-call variant is active; a GetTransliterationsMultiThreaded alternative
    // was previously present here as commented-out code.
    Dictionary<string, List<StringProbabEntry>> translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);

    foreach (ProcessedTermEntry processed in res.Values)
    {
        foreach (string word in processed.lowercaseWords)
        {
            List<StringProbabEntry> translits;
            if (translitDict.TryGetValue(word, out translits))
            {
                processed.transliterationList.Add(translits);
            }
            else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
            {
                processed.transliterationList.Add(translitTemp[langKey][word]);
            }
            else
            {
                processed.transliterationList.Add(new List<StringProbabEntry>());
            }
        }
    }

    // Simple size threshold so that the transliteration cache does not exhaust memory.
    if (translitTemp[langKey].Count >= 25000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return res;
}
/// <summary>
/// Builds one <see cref="ProcessedTermEntry"/> per string in <paramref name="terms"/>, in input order.
/// Each term is lowercased with <c>ToLower()</c>, split with <c>whitespaceRegex</c>, given per-word
/// dictionary translations from <paramref name="srcToTrgDict"/> and per-word transliterations from
/// <c>GetTransliterations</c> or the <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Surface forms to process; when <c>null</c> an empty list is returned.</param>
/// <param name="srcToTrgDict">Dictionary mapping a word (or its stem when <paramref name="stemWords"/> is set) to candidates with probabilities; when <c>null</c> no translation lists are produced.</param>
/// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
/// <param name="translitEntry">Transliteration configuration; its <c>srcLang</c> and <c>trgLang</c> form the cache key "src_trg".</param>
/// <param name="mosesPath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When <c>true</c>, dictionary lookups use <c>LightweightStemmer.Stem(word, lang)</c> instead of the word itself.</param>
/// <returns>The processed entries, one per input term.</returns>
public static List<ProcessedTermEntry> ProcessTermsList(List<string> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    List<ProcessedTermEntry> processedTerms = new List<ProcessedTermEntry>(1000);
    if (terms == null)
    {
        return processedTerms;
    }

    // Words that are not yet in the translitTemp cache and therefore need transliterating; the value is unused.
    Dictionary<string, int> wordsToTransliterate = new Dictionary<string, int>(1000);

    string langKey;
    if (translitEntry != null)
    {
        langKey = (translitEntry.srcLang ?? "") + "_" + (translitEntry.trgLang ?? "");
    }
    else
    {
        langKey = lang;
    }

    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (string surfaceForm in terms)
    {
        string lowered = surfaceForm.ToLower();

        ProcessedTermEntry termEntry = new ProcessedTermEntry();
        termEntry.surfaceForm = surfaceForm;
        termEntry.lowercaceForm = lowered;
        termEntry.surfaceFormWords = new List<string>(whitespaceRegex.Split(surfaceForm));

        string[] loweredWords = whitespaceRegex.Split(lowered);
        termEntry.lowercaseWords.InsertRange(0, loweredWords);

        foreach (string token in loweredWords)
        {
            termEntry.len += token.Length;

            bool alreadyQueued = wordsToTransliterate.ContainsKey(token);
            if (!alreadyQueued && !translitTemp[langKey].ContainsKey(token))
            {
                wordsToTransliterate.Add(token, 0);
            }

            // The dictionary is keyed by stems when stemming is enabled, otherwise by the word itself.
            string dictKey = stemWords ? LightweightStemmer.Stem(token, lang) : token;

            termEntry.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(token));

            if (srcToTrgDict == null)
            {
                continue;
            }

            List<StringProbabEntry> translations = new List<StringProbabEntry>();
            Dictionary<string, double> candidates;
            if (srcToTrgDict.TryGetValue(dictKey, out candidates))
            {
                foreach (KeyValuePair<string, double> candidate in candidates)
                {
                    StringProbabEntry item = new StringProbabEntry();
                    item.str = candidate.Key;
                    item.probab = candidate.Value;
                    translations.Add(item);
                }
            }
            termEntry.translationList.Add(translations);
        }

        processedTerms.Add(termEntry);
    }

    // Only the single-call variant is active; a GetTransliterationsMultiThreaded alternative
    // was previously present here as commented-out code.
    Dictionary<string, List<StringProbabEntry>> freshTranslits = GetTransliterations(wordsToTransliterate, translitEntry, mosesPath, tempFilePath, threadCount);

    foreach (ProcessedTermEntry termEntry in processedTerms)
    {
        foreach (string token in termEntry.lowercaseWords)
        {
            List<StringProbabEntry> found;
            if (freshTranslits.TryGetValue(token, out found))
            {
                termEntry.transliterationList.Add(found);
            }
            else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(token))
            {
                termEntry.transliterationList.Add(translitTemp[langKey][token]);
            }
            else
            {
                termEntry.transliterationList.Add(new List<StringProbabEntry>());
            }
        }
    }

    // Simple size threshold so that the transliteration cache does not exhaust memory.
    if (translitTemp[langKey].Count >= 50000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return processedTerms;
}
/// <summary>
/// Builds one <see cref="ProcessedTermEntry"/> per string in <paramref name="terms"/>, in input order.
/// Each term is lowercased with <c>ToLower()</c>, split with <c>whitespaceRegex</c>, given per-word
/// dictionary translations from <paramref name="srcToTrgDict"/> and per-word transliterations from
/// <c>GetTransliterations</c> or the <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Surface forms to process; when <c>null</c> an empty list is returned.</param>
/// <param name="srcToTrgDict">Dictionary mapping a word (or its stem when <paramref name="stemWords"/> is set) to candidates with probabilities; when <c>null</c> no translation lists are produced.</param>
/// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
/// <param name="translitEntry">Transliteration configuration; its <c>srcLang</c> and <c>trgLang</c> form the cache key "src_trg".</param>
/// <param name="mosesPath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When <c>true</c>, dictionary lookups use <c>LightweightStemmer.Stem(word, lang)</c> instead of the word itself.</param>
/// <returns>The processed entries, one per input term.</returns>
public static List<ProcessedTermEntry> ProcessTermsList(List<string> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    List<ProcessedTermEntry> output = new List<ProcessedTermEntry>(1000);
    if (terms == null)
    {
        return output;
    }

    // Collects words that the translitTemp cache does not hold yet; the int value is never read.
    Dictionary<string, int> pendingWords = new Dictionary<string, int>(1000);

    string langKey = lang;
    if (translitEntry != null)
    {
        string srcPart = translitEntry.srcLang != null ? translitEntry.srcLang : "";
        string trgPart = translitEntry.trgLang != null ? translitEntry.trgLang : "";
        langKey = srcPart + "_" + trgPart;
    }

    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (string surfaceForm in terms)
    {
        string lowerCased = surfaceForm.ToLower();

        ProcessedTermEntry current = new ProcessedTermEntry();
        current.surfaceForm = surfaceForm;
        current.lowercaceForm = lowerCased;
        current.surfaceFormWords = new List<string>(whitespaceRegex.Split(surfaceForm));

        string[] parts = whitespaceRegex.Split(lowerCased);
        current.lowercaseWords.InsertRange(0, parts);

        foreach (string part in parts)
        {
            current.len += part.Length;

            if (!(pendingWords.ContainsKey(part) || translitTemp[langKey].ContainsKey(part)))
            {
                pendingWords.Add(part, 0);
            }

            // Stem only when requested; the stem then replaces the word as the dictionary key.
            string key = part;
            if (stemWords)
            {
                key = LightweightStemmer.Stem(part, lang);
            }

            current.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(part));

            if (srcToTrgDict != null)
            {
                List<StringProbabEntry> perWord = new List<StringProbabEntry>();
                Dictionary<string, double> targets;
                if (srcToTrgDict.TryGetValue(key, out targets))
                {
                    foreach (KeyValuePair<string, double> target in targets)
                    {
                        StringProbabEntry probabEntry = new StringProbabEntry();
                        probabEntry.str = target.Key;
                        probabEntry.probab = target.Value;
                        perWord.Add(probabEntry);
                    }
                }
                current.translationList.Add(perWord);
            }
        }

        output.Add(current);
    }

    // Only the single-call variant is active; a GetTransliterationsMultiThreaded alternative
    // was previously present here as commented-out code.
    Dictionary<string, List<StringProbabEntry>> translitDict = GetTransliterations(pendingWords, translitEntry, mosesPath, tempFilePath, threadCount);

    foreach (ProcessedTermEntry current in output)
    {
        foreach (string part in current.lowercaseWords)
        {
            List<StringProbabEntry> hit;
            if (translitDict.TryGetValue(part, out hit))
            {
                current.transliterationList.Add(hit);
                continue;
            }

            if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(part))
            {
                current.transliterationList.Add(translitTemp[langKey][part]);
                continue;
            }

            current.transliterationList.Add(new List<StringProbabEntry>());
        }
    }

    // Simple size threshold so that the transliteration cache does not exhaust memory.
    if (translitTemp[langKey].Count >= 50000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return output;
}
/// <summary>
/// Converts every entry of <paramref name="terms"/> into a <see cref="ProcessedTermEntry"/>:
/// the surface form, lowercase form and annotation sequences are split with
/// <c>whitespaceRegex</c>, per-word dictionary translations are collected from
/// <paramref name="srcToTrgDict"/>, and per-word transliterations are attached from
/// <c>GetTransliterations</c> or from the <c>translitTemp</c> cache.
/// </summary>
/// <param name="terms">Terms keyed by their lowercase form; when <c>null</c> an empty result is returned.</param>
/// <param name="srcToTrgDict">Dictionary mapping a word (or its stem when <paramref name="stemWords"/> is set) to candidates with probabilities; when <c>null</c> no translation lists are produced.</param>
/// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
/// <param name="translitEntry">Transliteration configuration; its <c>srcLang</c> and <c>trgLang</c> form the cache key "src_trg".</param>
/// <param name="mosesPath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="tempFilePath">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="threadCount">Passed through to <c>GetTransliterations</c>.</param>
/// <param name="stemWords">When <c>true</c>, dictionary lookups use <c>LightweightStemmer.Stem(word, lang)</c> instead of the word itself.</param>
/// <returns>The processed entries keyed by the same lowercase form as in <paramref name="terms"/>.</returns>
public static Dictionary<string, ProcessedTermEntry> ProcessTerms(Dictionary<string, SimpleTermEntry> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    Dictionary<string, ProcessedTermEntry> res = new Dictionary<string, ProcessedTermEntry>(1000);
    if (terms == null)
    {
        return res;
    }

    // Words that still need a transliteration (i.e. are not yet in the translitTemp cache); the value is unused.
    Dictionary<string, int> lowercasedWordDict = new Dictionary<string, int>(1000);

    Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);

    string langKey = translitEntry != null
        ? (translitEntry.srcLang ?? "") + "_" + (translitEntry.trgLang ?? "")
        : lang;
    if (!translitTemp.ContainsKey(langKey))
    {
        translitTemp.Add(langKey, new Dictionary<string, List<StringProbabEntry>>());
    }

    foreach (KeyValuePair<string, SimpleTermEntry> termPair in terms)
    {
        string lowerCase = termPair.Key;
        SimpleTermEntry entry = termPair.Value;
        string surfaceForm = entry.term;

        // Only the first capitalization of a surface form is kept; other capitalizations are ignored.
        if (res.ContainsKey(lowerCase))
        {
            continue;
        }

        ProcessedTermEntry pte = new ProcessedTermEntry();
        pte.surfaceForm = surfaceForm;
        pte.concordance = !string.IsNullOrWhiteSpace(entry.conc) ? entry.conc : "";
        pte.normMsdSeq = !string.IsNullOrWhiteSpace(entry.normMsdSeq)
            ? new List<string>(whitespaceRegex.Split(entry.normMsdSeq))
            : new List<string>();
        pte.normSeq = !string.IsNullOrWhiteSpace(entry.normSeq)
            ? new List<string>(whitespaceRegex.Split(entry.normSeq))
            : new List<string>();
        pte.lowercaceForm = lowerCase;
        pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm)
            ? new List<string>(whitespaceRegex.Split(surfaceForm))
            : new List<string>();

        // An empty or whitespace-only key yields no words. An empty array (instead of null)
        // keeps the word loop below from throwing a NullReferenceException.
        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase)
            ? whitespaceRegex.Split(lowerCase)
            : new string[0];
        pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

        if (!string.IsNullOrWhiteSpace(entry.lemmaSeq))
        {
            pte.lemmaSeq = new List<string>(whitespaceRegex.Split(entry.lemmaSeq));
        }
        else
        {
            // No lemmas supplied: pad with one empty lemma per word.
            pte.lemmaSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.lemmaSeq.Add("");
            }
        }

        if (!string.IsNullOrWhiteSpace(entry.msdSeq))
        {
            pte.msdSeq = new List<string>(whitespaceRegex.Split(entry.msdSeq));
        }
        else
        {
            // No morpho-syntactic tags supplied: pad with one empty tag per word.
            pte.msdSeq = new List<string>();
            for (int i = 0; i < pte.lowercaseWords.Count; i++)
            {
                pte.msdSeq.Add("");
            }
        }

        foreach (string word in lowerCaseWordArr)
        {
            pte.len += word.Length;

            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
            {
                lowercasedWordDict.Add(word, 0);
            }

            // The dictionary is keyed by stems when stemming is enabled, otherwise by the word itself.
            string lookupKey = stemWords ? LightweightStemmer.Stem(word, lang) : word;

            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));

            if (srcToTrgDict != null)
            {
                List<StringProbabEntry> currList = new List<StringProbabEntry>();
                Dictionary<string, double> candidates;
                // A null key (stemmer produced nothing) is treated as "no translations".
                if (lookupKey != null && srcToTrgDict.TryGetValue(lookupKey, out candidates))
                {
                    foreach (KeyValuePair<string, double> candidate in candidates)
                    {
                        StringProbabEntry spe = new StringProbabEntry();
                        spe.str = candidate.Key;
                        spe.probab = candidate.Value;
                        currList.Add(spe);
                    }
                }
                pte.translationList.Add(currList);
            }
        }

        res.Add(lowerCase, pte);
    }

    // Only the single-call variant is active; a GetTransliterationsMultiThreaded alternative
    // was previously present here as commented-out code.
    Dictionary<string, List<StringProbabEntry>> translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);

    foreach (ProcessedTermEntry processed in res.Values)
    {
        foreach (string word in processed.lowercaseWords)
        {
            List<StringProbabEntry> translits;
            if (translitDict.TryGetValue(word, out translits))
            {
                processed.transliterationList.Add(translits);
            }
            else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
            {
                processed.transliterationList.Add(translitTemp[langKey][word]);
            }
            else
            {
                processed.transliterationList.Add(new List<StringProbabEntry>());
            }
        }
    }

    // Simple size threshold so that the transliteration cache does not exhaust memory.
    if (translitTemp[langKey].Count >= 25000)
    {
        translitTemp[langKey].Clear();
        GC.Collect();
    }

    return res;
}
/// <summary>
/// Aligns one source term with one target term by running the word-level alignment helpers
/// over every applicable pairing of the terms' representations, consolidating the overlaps,
/// and scoring the result.
/// </summary>
/// <remarks>
/// The last two arguments of each helper call are representation type codes:
/// 0 - dictionary translation, 1 - simple transliteration, 2 - the term's own lowercase words,
/// 3 - transliteration. Which pairings are run depends on <c>_interlinguaDictUsed</c> and
/// <c>_interlinguaTranslitUsed</c>. The static field <c>maxStrLen</c> is reset to 0 on every call.
/// </remarks>
/// <param name="srcPte">Pre-processed source term.</param>
/// <param name="trgPte">Pre-processed target term.</param>
/// <returns>
/// The alignment when both terms are non-null, <c>CreateStrListsForEval</c> succeeds and the score
/// reaches <c>_lpeConf.finalAlignmentThr</c>; otherwise <c>null</c>. The returned element has its
/// intermediate alignment lists cleared.
/// </returns>
public static AlignmentInfoElement AlignSingleTermPair(ProcessedTermEntry srcPte, ProcessedTermEntry trgPte)
{
    if (srcPte == null || trgPte == null)
    {
        return null;
    }

    AlignmentInfoElement aie = new AlignmentInfoElement();
    List<WordAlignmentElement> srcToTrg = new List<WordAlignmentElement>();
    List<WordAlignmentElement> trgToSrc = new List<WordAlignmentElement>();
    maxStrLen = 0;

    if (_interlinguaDictUsed && _interlinguaTranslitUsed)
    {
        // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
        AlignStringProbabEntryListLists(_lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
        // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
        // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLITERATION
        AlignStringProbabEntryListLists(_lpeConf, srcPte.translationList, trgPte.transliterationList, srcToTrg, trgToSrc, 0, 3);
        // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
        // Translation is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLATION
        AlignStringProbabEntryListLists(_lpeConf, srcPte.transliterationList, trgPte.translationList, srcToTrg, trgToSrc, 3, 0);
        // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
        // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
        // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
        AlignStringProbabEntryListLists(_lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 3);
        // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringLists(_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
    }
    else if (_interlinguaTranslitUsed)
    {
        // Translation is in target language; SOURCE TRANSLATION vs TARGET
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
        // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);
        // Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);
        // Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION.
        // The target side is the transliteration list, so its type code must be 3 (it was 2,
        // which made later lookups read the target's lowercase words instead).
        AlignStringProbabEntryListLists(_lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 3);
        // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringLists(_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
        // Translation is in target language; SOURCE vs TARGET TRANSLATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
    }
    else if (_interlinguaDictUsed)
    {
        // Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
        AlignStringProbabEntryListLists(_lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);
        // Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);
        // Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);
        // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
        // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringLists(_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
        // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
    }
    else
    {
        // Translation is in target language; SOURCE TRANSLATION vs TARGET
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);
        // Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
        AlignStringProbabEntryListToStringList(_lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);
        // Translation is in target language; SOURCE vs TARGET TRANSLATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
        // Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
        AlignStringListToStringProbabEntryList(_lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);
        // Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
        AlignStringLists(_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);
    }

    aie.srcToTrgAlignments = srcToTrg;
    aie.trgToSrcAlignments = trgToSrc;
    aie.srcEntry = srcPte;
    aie.trgEntry = trgPte;
    ConsolidateOverlaps(_lpeConf, aie, _excDict);

    if (!CreateStrListsForEval(_configuration, aie, _srcStopWords, _trgStopWords))
    {
        return null;
    }

    aie.alignmentScore = EvaluateAlignmentScore(_lpeConf, aie);
    if (aie.alignmentScore < _lpeConf.finalAlignmentThr)
    {
        return null;
    }

    // If you wish to debug the process, comment out the lines below that clear the alignments.
    aie.srcToTrgAlignments.Clear();
    aie.trgToSrcAlignments.Clear();
    aie.consolidatedAlignment.Clear();
    aie.srcFile = _srcFile;
    aie.trgFile = _trgFile;
    return aie;
}
/// <summary>
/// Returns the probability attached to one candidate of one word of a term.
/// </summary>
/// <param name="pte">Term whose lists are read.</param>
/// <param name="id">Index of the word within the term.</param>
/// <param name="type">Representation type: 0 - dictionary, 1 - simple translit, 2 - target, 3 - translit.</param>
/// <param name="typeId">Index of the candidate within the word's list; only read for types 0 and 3.</param>
/// <returns>
/// The stored <c>probab</c> of the selected candidate for types 0 and 3; 1 for every other type.
/// </returns>
public static double GetProbab(ProcessedTermEntry pte, int id, short type, int typeId)
{
    switch (type)
    {
        case 0:
            return pte.translationList[id][typeId].probab;
        case 3:
            return pte.transliterationList[id][typeId].probab;
        default:
            // Types 1 and 2 (and any unknown type) carry no probability of their own.
            return 1;
    }
}
/// <summary>
/// Returns the string of one word of a term in the requested representation.
/// </summary>
/// <param name="pte">Term whose lists are read.</param>
/// <param name="id">Index of the word within the term.</param>
/// <param name="type">Representation type: 0 - dictionary, 1 - simple translit, 2 - target, 3 - translit.</param>
/// <param name="typeId">Index of the candidate within the word's list; only read for types 0 and 3.</param>
/// <returns>The selected string, or <c>null</c> when <paramref name="type"/> is not 0, 1, 2 or 3.</returns>
public static string GetCorrectString(ProcessedTermEntry pte, int id, short type, int typeId)
{
    switch (type)
    {
        case 0:
            return pte.translationList[id][typeId].str;
        case 1:
            return pte.simpleTransliteration[id];
        case 2:
            return pte.lowercaseWords[id];
        case 3:
            return pte.transliterationList[id][typeId].str;
        default:
            return null;
    }
}