/// <summary>
/// Parses a probabilistic dictionary file into a nested lookup table.
/// </summary>
/// <returns>
/// The dictionary in the following <c>Dictionary</c> format: "Source->Target->Probability".
/// Only the first probability seen for a given source/target pair is kept.
/// </returns>
/// <param name='cde'>
/// MPAligner Configuration Dictionary Entry describing how to parse the dictionary
/// (file path, encoding, separator characters, probability threshold, stemming and filtering options).
/// </param>
public static Dictionary<string, Dictionary<string, double>> ParseDictionary(MPAlignerConfigurationDictEntry cde)
{
    string fileName = cde.path;
    Encoding enc = cde.encoding;
    char[] sep = cde.separators.ToCharArray();
    bool filterDictionary = cde.filterDictionary;
    NumberFormatInfo nfi = cde.numberFormatInfo;
    Dictionary<string, Dictionary<string, double>> res = new Dictionary<string, Dictionary<string, double>>();
    // "using" guarantees the reader is closed even when reading throws - the original
    // called sr.Close() only on the success path, leaking the file handle on error.
    using (StreamReader sr = new StreamReader(fileName, enc))
    {
        // Read the dictionary file to the end.
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine().Trim();
            string[] data = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
            // Apply valid entry filtering. We ignore lines that do not have three
            // constituents - source word/phrase, target word/phrase, probability.
            if (data.Length == 3 && (!filterDictionary || (IsValidPhrase(data[0]) && IsValidPhrase(data[1]))))
            {
                // TryParse with the same styles Convert.ToDouble(string, IFormatProvider)
                // uses replaces the original try { ... } catch { } that silently swallowed
                // every exception type; malformed probabilities are still skipped.
                // NOTE(review): assumes cde.dictBf.Get and LightweightStemmer.Stem do not
                // throw on valid input - confirm, as the old blanket catch hid such errors.
                double prob;
                if (!double.TryParse(data[2], NumberStyles.Float | NumberStyles.AllowThousands, nfi, out prob))
                {
                    continue;
                }
                // Apply threshold filtering.
                if (prob >= cde.variantThreshold)
                {
                    prob = cde.dictBf.Get(prob);
                    string src = data[0].ToLower();
                    string trg = data[1].ToLower();
                    // Apply stemming if required.
                    if (cde.stem)
                    {
                        src = LightweightStemmer.Stem(src, cde.srcLang);
                        trg = LightweightStemmer.Stem(trg, cde.trgLang);
                    }
                    // Single-lookup insert (the original did ContainsKey followed by the
                    // indexer, paying for each hash lookup twice).
                    Dictionary<string, double> trgMap;
                    if (!res.TryGetValue(src, out trgMap))
                    {
                        trgMap = new Dictionary<string, double>();
                        res.Add(src, trgMap);
                    }
                    // Keep only the first probability for a target (original behavior).
                    if (!trgMap.ContainsKey(trg))
                    {
                        trgMap.Add(trg, prob);
                    }
                }
            }
        }
    }
    // Apply maximum translation hypothesis filtering.
    //FilterTopEquivalents (cde, res);
    return res;
}
/// <summary>
/// Parses a stop-word list file (one word per line) into a membership dictionary.
/// </summary>
/// <returns>
/// A dictionary whose keys are the (optionally stemmed) stop-words; every value is <c>true</c>.
/// </returns>
/// <param name='cswle'>
/// Configuration entry describing the stop-word list (file path, encoding, language, stemming flag).
/// </param>
public static Dictionary<string, bool> ParseStopwordList(MPAlignerConfigurationStopWordListEntry cswle)
{
    string fileName = cswle.path;
    Encoding enc = cswle.encoding;
    Dictionary<string, bool> res = new Dictionary<string, bool>();
    // "using" closes the reader deterministically - the original never closed it at all,
    // leaking the file handle until finalization.
    using (StreamReader sr = new StreamReader(fileName, enc))
    {
        // Read the stop-word file to the end.
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine().Trim();
            // Apply stemming if required, so lookups match stemmed text.
            if (cswle.stem)
            {
                line = LightweightStemmer.Stem(line, cswle.lang);
            }
            // Indexer assignment avoids the ContainsKey + Add double lookup; the stored
            // value is always true, so overwriting a duplicate is behaviorally identical.
            res[line] = true;
        }
    }
    return res;
}
/// <summary>
/// Parses an exception dictionary of source/target pairs separated by tabs or spaces.
/// </summary>
/// <returns>
/// The pairs in "Source->Target->true" <c>Dictionary</c> format (a membership set keyed
/// by source, then target).
/// </returns>
/// <param name='cee'>
/// Configuration entry describing the exception list (file path, encoding, languages, stemming flag).
/// </param>
public static Dictionary<string, Dictionary<string, bool>> ParseExceptionDictionary(MPAlignerConfigurationExceptionEntry cee)
{
    string fileName = cee.path;
    Encoding enc = cee.encoding;
    char[] sep = { '\t', ' ' };
    Dictionary<string, Dictionary<string, bool>> res = new Dictionary<string, Dictionary<string, bool>>();
    // "using" closes the reader deterministically - the original never closed it,
    // leaking the file handle until finalization.
    using (StreamReader sr = new StreamReader(fileName, enc))
    {
        // Read the dictionary file to the end.
        while (!sr.EndOfStream)
        {
            string line = sr.ReadLine().Trim();
            string[] data = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
            // Apply valid entry filtering. We ignore lines that do not have at least two
            // constituents - source word/phrase and target word/phrase (extras ignored).
            if (data.Length >= 2)
            {
                string src = data[0].ToLower();
                string trg = data[1].ToLower();
                // Apply stemming if required.
                if (cee.stem)
                {
                    src = LightweightStemmer.Stem(src, cee.srcLang);
                    trg = LightweightStemmer.Stem(trg, cee.trgLang);
                }
                // Single-lookup insert instead of the original ContainsKey + indexer pair.
                Dictionary<string, bool> trgMap;
                if (!res.TryGetValue(src, out trgMap))
                {
                    trgMap = new Dictionary<string, bool>();
                    res.Add(src, trgMap);
                }
                // Value is always true, so assignment is equivalent to the original
                // "add only if absent" check.
                trgMap[trg] = true;
            }
        }
    }
    return res;
}
/// <summary>
/// Pre-processes a plain list of term surface forms: lowercases and tokenizes each term,
/// optionally stems its words, attaches per-word simple transliterations, dictionary
/// translation candidates, and Moses-based transliteration candidates.
/// </summary>
/// <returns>One <c>ProcessedTermEntry</c> per input term, in input order.</returns>
/// <param name='terms'>Term surface forms to process; <c>null</c> yields an empty result.</param>
/// <param name='srcToTrgDict'>Source->Target->Probability dictionary; may be <c>null</c> to skip translation lookup.</param>
/// <param name='lang'>Language code used for stemming and as a transliteration-cache key fallback.</param>
/// <param name='translitEntry'>Transliteration configuration; may be <c>null</c>.</param>
/// <param name='mosesPath'>Path passed through to <c>GetTransliterations</c>.</param>
/// <param name='tempFilePath'>Temp file path passed through to <c>GetTransliterations</c>.</param>
/// <param name='threadCount'>Thread count passed through to <c>GetTransliterations</c>.</param>
/// <param name='stemWords'>If true, dictionary lookup uses stemmed words instead of raw words.</param>
public static List <ProcessedTermEntry> ProcessTermsList(List <string> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    List <ProcessedTermEntry> res = new List <ProcessedTermEntry>(1000);
    // Unique lowercased words that still need transliteration (value is unused).
    Dictionary <string, int> lowercasedWordDict = new Dictionary <string, int>(1000);
    if (terms != null)
    {
        // Cache key for the static translitTemp store: "src_trg" when a transliteration
        // config is present, otherwise just the language code.
        string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
        if (!translitTemp.ContainsKey(langKey))
        {
            translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
        }
        foreach (string surfaceForm in terms)
        {
            string lowerCase = surfaceForm.ToLower();
            ProcessedTermEntry pte = new ProcessedTermEntry();
            pte.surfaceForm = surfaceForm;
            pte.lowercaceForm = lowerCase;
            pte.surfaceFormWords = new List <string>(whitespaceRegex.Split(surfaceForm));
            string[] lowerCaseWordArr = whitespaceRegex.Split(lowerCase);
            // NOTE(review): assumes ProcessedTermEntry's constructor initializes
            // lowercaseWords (and the other list fields used below) - confirm.
            pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);
            foreach (string word in lowerCaseWordArr)
            {
                // Term length = sum of word lengths (whitespace excluded).
                pte.len += word.Length;
                // Queue the word for transliteration unless already queued or cached.
                if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                {
                    lowercasedWordDict.Add(word, 0);
                }
                string stem = null;
                if (stemWords)
                {
                    stem = LightweightStemmer.Stem(word, lang);
                }
                //if (lang !="en")
                //{
                pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                //}
                //else
                //{
                //	pte.simpleTransliteration = pte.lowercaseWords;
                //}
                // Collect translation candidates for this word (stemmed or raw form),
                // one candidate list per word, aligned with lowercaseWords by position.
                if (srcToTrgDict != null)
                {
                    List <StringProbabEntry> currList = new List <StringProbabEntry>();
                    if (stemWords)
                    {
                        if (srcToTrgDict.ContainsKey(stem))
                        {
                            foreach (string trgStem in srcToTrgDict[stem].Keys)
                            {
                                StringProbabEntry spe = new StringProbabEntry();
                                spe.str = trgStem;
                                spe.probab = srcToTrgDict[stem][trgStem];
                                currList.Add(spe);
                            }
                        }
                    }
                    else
                    {
                        if (srcToTrgDict.ContainsKey(word))
                        {
                            foreach (string trgWord in srcToTrgDict[word].Keys)
                            {
                                StringProbabEntry spe = new StringProbabEntry();
                                spe.str = trgWord;
                                spe.probab = srcToTrgDict[word][trgWord];
                                currList.Add(spe);
                            }
                        }
                    }
                    pte.translationList.Add(currList);
                }
            }
            res.Add(pte);
        }
        // Batch-transliterate every newly seen word in one call.
        Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
        //if (threadCount<2)
        //{
        translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
        //This is not nice, however necessary due to the multi-threaded execution - the temp list is not updated in the single-thread scenario
        //	Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
        //	CopyTranslits(translitDict,tmp, translitEntry);
        //}
        //else
        //{
        //	translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
        //}
        // Attach a transliteration candidate list per word: fresh results first, then the
        // static cache, then an empty list so positions stay aligned with lowercaseWords.
        for (int i = 0; i < res.Count; i++)
        {
            foreach (string word in res[i].lowercaseWords)
            {
                if (translitDict.ContainsKey(word))
                {
                    res[i].transliterationList.Add(translitDict[word]);
                }
                else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                {
                    res[i].transliterationList.Add(translitTemp[langKey][word]);
                }
                else
                {
                    res[i].transliterationList.Add(new List <StringProbabEntry>());
                }
            }
        }
        //We add a simple data amount threshold in order not to overflow the memory ...
        if (translitTemp[langKey].Count >= 50000)
        {
            translitTemp[langKey].Clear();
            // NOTE(review): forcing GC in production code is a known anti-pattern;
            // presumably kept to bound memory after clearing a large cache - reconsider.
            GC.Collect();
        }
    }
    return res;
}
/// <summary>
/// Pre-processes extracted terms keyed by lowercased form: tokenizes surface/normalized/
/// lemma/MSD sequences, optionally stems words, and attaches per-word simple
/// transliterations, dictionary translation candidates, and Moses-based transliterations.
/// </summary>
/// <returns>
/// A dictionary from lowercased term form to its <c>ProcessedTermEntry</c>.
/// </returns>
/// <param name='terms'>Terms keyed by lowercased form; <c>null</c> yields an empty result.</param>
/// <param name='srcToTrgDict'>Source->Target->Probability dictionary; may be <c>null</c> to skip translation lookup.</param>
/// <param name='lang'>Language code used for stemming, logging, and as a transliteration-cache key fallback.</param>
/// <param name='translitEntry'>Transliteration configuration; may be <c>null</c>.</param>
/// <param name='mosesPath'>Path passed through to <c>GetTransliterations</c>.</param>
/// <param name='tempFilePath'>Temp file path passed through to <c>GetTransliterations</c>.</param>
/// <param name='threadCount'>Thread count passed through to <c>GetTransliterations</c>.</param>
/// <param name='stemWords'>If true, dictionary lookup uses stemmed words instead of raw words.</param>
public static Dictionary <string, ProcessedTermEntry> ProcessTerms(Dictionary <string, SimpleTermEntry> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
{
    Dictionary <string, ProcessedTermEntry> res = new Dictionary <string, ProcessedTermEntry>(1000);
    // Unique lowercased words that still need transliteration (value is unused).
    Dictionary <string, int> lowercasedWordDict = new Dictionary <string, int>(1000);
    if (terms != null)
    {
        Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);
        // Cache key for the static translitTemp store: "src_trg" when a transliteration
        // config is present, otherwise just the language code.
        string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
        if (!translitTemp.ContainsKey(langKey))
        {
            translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
        }
        foreach (string lowerCase in terms.Keys)
        {
            string surfaceForm = terms[lowerCase].term;
            if (!res.ContainsKey(lowerCase)) //TODO: Nothing to do, but be aware that here we allow only the first capitalization of a surface form ... we will ignore other capitalizations.
            {
                ProcessedTermEntry pte = new ProcessedTermEntry();
                pte.surfaceForm = surfaceForm;
                // Optional fields fall back to empty string / empty list when missing.
                pte.concordance = !string.IsNullOrWhiteSpace(terms [lowerCase].conc) ? terms [lowerCase].conc : "";
                pte.normMsdSeq = !string.IsNullOrWhiteSpace(terms [lowerCase].normMsdSeq) ? new List <string> (whitespaceRegex.Split(terms [lowerCase].normMsdSeq)) : new List <string> ();
                pte.normSeq = !string.IsNullOrWhiteSpace(terms [lowerCase].normSeq) ? new List <string>(whitespaceRegex.Split(terms[lowerCase].normSeq)):new List <string>();
                pte.lowercaceForm = lowerCase;
                pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm) ? new List <string> (whitespaceRegex.Split(surfaceForm)) : new List <string> ();
                string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase) ? whitespaceRegex.Split(lowerCase) : null;
                if (lowerCaseWordArr != null)
                {
                    pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);
                }
                // Lemma and MSD sequences are padded with empty strings to stay
                // position-aligned with lowercaseWords when absent.
                if (!string.IsNullOrWhiteSpace(terms[lowerCase].lemmaSeq))
                {
                    pte.lemmaSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].lemmaSeq));
                }
                else
                {
                    pte.lemmaSeq = new List <string>();
                    for (int i = 0; i < pte.lowercaseWords.Count; i++)
                    {
                        pte.lemmaSeq.Add("");
                    }
                }
                if (!string.IsNullOrWhiteSpace(terms[lowerCase].msdSeq))
                {
                    pte.msdSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].msdSeq));
                }
                else
                {
                    pte.msdSeq = new List <string>();
                    for (int i = 0; i < pte.lowercaseWords.Count; i++)
                    {
                        pte.msdSeq.Add("");
                    }
                }
                // NOTE(review): lowerCaseWordArr is null when lowerCase is null/whitespace,
                // which would throw NullReferenceException here - presumably keys are never
                // whitespace-only; confirm against the term extractor.
                foreach (string word in lowerCaseWordArr)
                {
                    // Term length = sum of word lengths (whitespace excluded).
                    pte.len += word.Length;
                    // Queue the word for transliteration unless already queued or cached.
                    if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                    {
                        lowercasedWordDict.Add(word, 0);
                    }
                    string stem = null;
                    if (stemWords)
                    {
                        stem = LightweightStemmer.Stem(word, lang);
                    }
                    //if (lang !="en")
                    //{
                    pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                    //}
                    //else
                    //{
                    //	pte.simpleTransliteration = pte.lowercaseWords;
                    //}
                    // Collect translation candidates for this word (stemmed or raw form),
                    // one candidate list per word, aligned with lowercaseWords by position.
                    if (srcToTrgDict != null)
                    {
                        List <StringProbabEntry> currList = new List <StringProbabEntry>();
                        if (stemWords)
                        {
                            if (srcToTrgDict.ContainsKey(stem))
                            {
                                foreach (string trgStem in srcToTrgDict[stem].Keys)
                                {
                                    StringProbabEntry spe = new StringProbabEntry();
                                    spe.str = trgStem;
                                    spe.probab = srcToTrgDict[stem][trgStem];
                                    currList.Add(spe);
                                }
                            }
                        }
                        else
                        {
                            if (srcToTrgDict.ContainsKey(word))
                            {
                                foreach (string trgWord in srcToTrgDict[word].Keys)
                                {
                                    StringProbabEntry spe = new StringProbabEntry();
                                    spe.str = trgWord;
                                    spe.probab = srcToTrgDict[word][trgWord];
                                    currList.Add(spe);
                                }
                            }
                        }
                        pte.translationList.Add(currList);
                    }
                }
                res.Add(lowerCase, pte);
            }
        }
        // Batch-transliterate every newly seen word in one call.
        Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
        //if (threadCount<2)
        //{
        translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
        //Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
        //CopyTranslits(translitDict,tmp, translitEntry);
        //}
        //else
        //{
        //	translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
        //}
        // Attach a transliteration candidate list per word: fresh results first, then the
        // static cache, then an empty list so positions stay aligned with lowercaseWords.
        foreach (string lowerCase in res.Keys)
        {
            foreach (string word in res[lowerCase].lowercaseWords)
            {
                if (translitDict.ContainsKey(word))
                {
                    res[lowerCase].transliterationList.Add(translitDict[word]);
                }
                else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                {
                    res[lowerCase].transliterationList.Add(translitTemp[langKey][word]);
                }
                else
                {
                    res[lowerCase].transliterationList.Add(new List <StringProbabEntry>());
                }
            }
        }
        //We add a simple data amount threshold in order not to overflow the memory ...
        // NOTE(review): threshold here is 25000 but 50000 in ProcessTermsList - confirm
        // the asymmetry is intentional. Forced GC is a known anti-pattern; reconsider.
        if (translitTemp[langKey].Count >= 25000)
        {
            translitTemp[langKey].Clear();
            GC.Collect();
        }
    }
    return res;
}