/// <summary>
        /// Parses a probabilistic dictionary file into a nested source-to-target lookup table.
        /// </summary>
        /// <remarks>
        /// Each line is trimmed and split on <c>cde.separators</c>; only lines that yield exactly three
        /// fields (source, target, probability) are considered. When <c>cde.filterDictionary</c> is set,
        /// both phrases must additionally pass <c>IsValidPhrase</c>. Entries whose probability is below
        /// <c>cde.variantThreshold</c> are dropped; the remaining probabilities are passed through
        /// <c>cde.dictBf.Get</c> before being stored. Source and target are lower-cased and, when
        /// <c>cde.stem</c> is set, stemmed. If the same source/target pair occurs more than once,
        /// the first occurrence wins.
        /// </remarks>
        /// <returns>
        /// The dictionary in the following <c>Dictionary</c> format: "Source->Target->Probability".
        /// </returns>
        /// <param name='cde'>
        /// MPAligner Configuration Dictionary Entry describing how to parse the dictionary.
        /// </param>
        public static Dictionary <string, Dictionary <string, double> > ParseDictionary(MPAlignerConfigurationDictEntry cde)
        {
            string   fileName = cde.path;
            Encoding enc      = cde.encoding;

            char[]           sep = cde.separators.ToCharArray();
            bool             filterDictionary = cde.filterDictionary;
            NumberFormatInfo nfi = cde.numberFormatInfo;

            Dictionary <string, Dictionary <string, double> > res = new Dictionary <string, Dictionary <string, double> > ();

            // The using block guarantees the file handle is released even if reading or
            // filtering throws; previously the reader was only closed on the success path.
            using (StreamReader sr = new StreamReader(fileName, enc))
            {
                //Read the dictionary file to the end.
                while (!sr.EndOfStream)
                {
                    string   line = sr.ReadLine().Trim();
                    string[] data = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);

                    //We ignore lines that do not have three constituents - source word/phrase, target word/phrase, probability.
                    if (data.Length != 3)
                    {
                        continue;
                    }
                    //Apply valid entry filtering.
                    if (filterDictionary && !(IsValidPhrase(data [0]) && IsValidPhrase(data [1])))
                    {
                        continue;
                    }

                    try
                    {
                        double prob = Convert.ToDouble(data [2], nfi);
                        //Apply threshold filtering.
                        if (prob < cde.variantThreshold)
                        {
                            continue;
                        }
                        prob = cde.dictBf.Get(prob);

                        string src = data [0].ToLower();
                        string trg = data [1].ToLower();
                        //Apply stemming if required.
                        if (cde.stem)
                        {
                            src = LightweightStemmer.Stem(src, cde.srcLang);
                            trg = LightweightStemmer.Stem(trg, cde.trgLang);
                        }

                        //Add the entry to the result; the first probability seen for a pair is kept.
                        Dictionary <string, double> targets;
                        if (!res.TryGetValue(src, out targets))
                        {
                            targets = new Dictionary <string, double> ();
                            res.Add(src, targets);
                        }
                        if (!targets.ContainsKey(trg))
                        {
                            targets.Add(trg, prob);
                        }
                    }
                    catch (Exception)
                    {
                        // Deliberately best-effort: an entry that cannot be processed
                        // (e.g. an unparsable probability) is skipped and parsing continues.
                    }
                }
            }
            //Apply maximum translation hypothesis filtering.
            //FilterTopEquivalents (cde, res);
            return(res);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a stop-word list file (one entry per line) into a lookup table.
        /// </summary>
        /// <remarks>
        /// Each line is trimmed and, when <c>cswle.stem</c> is set, stemmed with
        /// <c>LightweightStemmer.Stem</c> for <c>cswle.lang</c>. Duplicate entries are stored once.
        /// Entries are not lower-cased by this method.
        /// </remarks>
        /// <param name='cswle'>
        /// Stop-word list configuration entry supplying the file path, encoding, stemming flag and language.
        /// </param>
        /// <returns>
        /// A dictionary whose keys are the stop words; every value is <c>true</c>.
        /// </returns>
        public static Dictionary <string, bool> ParseStopwordList(MPAlignerConfigurationStopWordListEntry cswle)
        {
            string   fileName             = cswle.path;
            Encoding enc                  = cswle.encoding;
            Dictionary <string, bool> res = new Dictionary <string, bool> ();

            // The reader was previously never closed; the using block releases the
            // file handle on both the normal and the exceptional path.
            using (StreamReader sr = new StreamReader(fileName, enc))
            {
                //Read the stop-word list file to the end.
                while (!sr.EndOfStream)
                {
                    string line = sr.ReadLine().Trim();
                    if (cswle.stem)
                    {
                        line = LightweightStemmer.Stem(line, cswle.lang);
                    }
                    if (!res.ContainsKey(line))
                    {
                        res.Add(line, true);
                    }
                }
            }
            return(res);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses an exception dictionary file into a nested source-to-target lookup table.
        /// </summary>
        /// <remarks>
        /// Each line is trimmed and split on tab and space characters. Lines with fewer than two
        /// fields are ignored; for longer lines only the first two fields (source, target) are used.
        /// Both are lower-cased and, when <c>cee.stem</c> is set, stemmed.
        /// </remarks>
        /// <param name='cee'>
        /// Exception dictionary configuration entry supplying the file path, encoding, stemming flag and languages.
        /// </param>
        /// <returns>
        /// A dictionary in the format "Source->Target->true".
        /// </returns>
        public static Dictionary <string, Dictionary <string, bool> > ParseExceptionDictionary(MPAlignerConfigurationExceptionEntry cee)
        {
            string   fileName = cee.path;
            Encoding enc      = cee.encoding;

            char[] sep = { '\t', ' ' };
            Dictionary <string, Dictionary <string, bool> > res = new Dictionary <string, Dictionary <string, bool> > ();

            // The reader was previously never closed; the using block releases the
            // file handle on both the normal and the exceptional path.
            using (StreamReader sr = new StreamReader(fileName, enc))
            {
                //Read the exception dictionary file to the end.
                while (!sr.EndOfStream)
                {
                    string   line = sr.ReadLine().Trim();
                    string[] data = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
                    //We ignore lines that do not have two constituents - source word/phrase, target word/phrase.
                    if (data.Length < 2)
                    {
                        continue;
                    }

                    string src = data[0].ToLower();
                    string trg = data[1].ToLower();
                    //Apply stemming if required.
                    if (cee.stem)
                    {
                        src = LightweightStemmer.Stem(src, cee.srcLang);
                        trg = LightweightStemmer.Stem(trg, cee.trgLang);
                    }

                    //Add the pair to the nested Dictionary<string, bool>.
                    Dictionary <string, bool> targets;
                    if (!res.TryGetValue(src, out targets))
                    {
                        targets = new Dictionary <string, bool> ();
                        res.Add(src, targets);
                    }
                    if (!targets.ContainsKey(trg))
                    {
                        targets.Add(trg, true);
                    }
                }
            }
            return(res);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Pre-processes a list of terms into <c>ProcessedTermEntry</c> objects, one per input term and in input order.
        /// </summary>
        /// <remarks>
        /// For every term the surface and lower-cased forms are split into words with <c>whitespaceRegex</c>.
        /// For every lower-cased word the method records a simple character transliteration, collects the
        /// translation candidates found in <paramref name="srcToTrgDict"/> (looked up by stem when
        /// <paramref name="stemWords"/> is set, otherwise by the word itself) and finally attaches a
        /// transliteration list taken from <c>GetTransliterations</c> or, failing that, from the shared
        /// <c>translitTemp</c> cache. Words already present in that cache are not sent to
        /// <c>GetTransliterations</c> again. The cache for the language key is cleared once it holds
        /// 50000 or more entries.
        /// </remarks>
        /// <param name="terms">Terms to process; when <c>null</c> an empty list is returned.</param>
        /// <param name="srcToTrgDict">Source-to-target probabilistic dictionary; when <c>null</c> no translation lists are added.</param>
        /// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
        /// <param name="translitEntry">Transliteration configuration; its source/target languages form the cache key and it is forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="mosesPath">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="tempFilePath">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="threadCount">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="stemWords">Whether dictionary look-ups use the stemmed form of each word.</param>
        /// <returns>The processed entries.</returns>
        public static List <ProcessedTermEntry> ProcessTermsList(List <string> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            List <ProcessedTermEntry> processed = new List <ProcessedTermEntry>(1000);
            if (terms == null)
            {
                return(processed);
            }

            // Words that still need a transliteration (neither queued yet nor cached).
            Dictionary <string, int> pendingWords = new Dictionary <string, int>(1000);

            // Cache key: "<src>_<trg>" when a transliteration entry is given, otherwise the plain language.
            string langKey;
            if (translitEntry == null)
            {
                langKey = lang;
            }
            else
            {
                langKey = (translitEntry.srcLang ?? "") + "_" + (translitEntry.trgLang ?? "");
            }
            if (!translitTemp.ContainsKey(langKey))
            {
                translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
            }

            foreach (string term in terms)
            {
                string   loweredTerm  = term.ToLower();
                string[] loweredWords = whitespaceRegex.Split(loweredTerm);

                ProcessedTermEntry entry = new ProcessedTermEntry();
                entry.surfaceForm      = term;
                entry.lowercaceForm    = loweredTerm;
                entry.surfaceFormWords = new List <string>(whitespaceRegex.Split(term));
                entry.lowercaseWords.InsertRange(0, loweredWords);

                foreach (string token in loweredWords)
                {
                    entry.len += token.Length;

                    bool alreadyHandled = pendingWords.ContainsKey(token) || translitTemp[langKey].ContainsKey(token);
                    if (!alreadyHandled)
                    {
                        pendingWords.Add(token, 0);
                    }

                    // The dictionary is keyed by stems when stemming is on, by plain words otherwise.
                    string lookupKey = stemWords ? LightweightStemmer.Stem(token, lang) : token;

                    entry.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(token));

                    if (srcToTrgDict == null)
                    {
                        continue;
                    }

                    List <StringProbabEntry> candidates = new List <StringProbabEntry>();
                    Dictionary <string, double> targets;
                    if (srcToTrgDict.TryGetValue(lookupKey, out targets))
                    {
                        foreach (KeyValuePair <string, double> pair in targets)
                        {
                            StringProbabEntry candidate = new StringProbabEntry();
                            candidate.str    = pair.Key;
                            candidate.probab = pair.Value;
                            candidates.Add(candidate);
                        }
                    }
                    // One (possibly empty) candidate list per word keeps the lists aligned with the words.
                    entry.translationList.Add(candidates);
                }
                processed.Add(entry);
            }

            Dictionary <string, List <StringProbabEntry> > translitDict = GetTransliterations(pendingWords, translitEntry, mosesPath, tempFilePath, threadCount);

            foreach (ProcessedTermEntry entry in processed)
            {
                foreach (string token in entry.lowercaseWords)
                {
                    List <StringProbabEntry> translits;
                    if (translitDict.TryGetValue(token, out translits))
                    {
                        entry.transliterationList.Add(translits);
                    }
                    else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(token))
                    {
                        entry.transliterationList.Add(translitTemp[langKey][token]);
                    }
                    else
                    {
                        entry.transliterationList.Add(new List <StringProbabEntry>());
                    }
                }
            }

            //We add a simple data amount threshold in order not to overflow the memory ...
            if (translitTemp[langKey].Count >= 50000)
            {
                translitTemp[langKey].Clear();
                GC.Collect();
            }
            return(processed);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Pre-processes a dictionary of terms (keyed by lower-cased form) into <c>ProcessedTermEntry</c> objects.
        /// </summary>
        /// <remarks>
        /// For every term the method copies the concordance and the normalised, lemma and MSD sequences
        /// (splitting them with <c>whitespaceRegex</c>; missing lemma/MSD sequences are padded with one
        /// empty string per lower-cased word), splits the surface and lower-cased forms into words,
        /// records a simple character transliteration per word and collects the translation candidates
        /// found in <paramref name="srcToTrgDict"/> (looked up by stem when <paramref name="stemWords"/>
        /// is set, otherwise by the word itself). Finally each word gets a transliteration list taken
        /// from <c>GetTransliterations</c> or, failing that, from the shared <c>translitTemp</c> cache.
        /// The cache for the language key is cleared once it holds 25000 or more entries.
        /// A key that is empty or consists only of white space yields an entry without words.
        /// </remarks>
        /// <param name="terms">Terms keyed by their lower-cased form; when <c>null</c> an empty result is returned.</param>
        /// <param name="srcToTrgDict">Source-to-target probabilistic dictionary; when <c>null</c> no translation lists are added.</param>
        /// <param name="lang">Language passed to the stemmer; also used as the cache key when <paramref name="translitEntry"/> is <c>null</c>.</param>
        /// <param name="translitEntry">Transliteration configuration; its source/target languages form the cache key and it is forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="mosesPath">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="tempFilePath">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="threadCount">Forwarded to <c>GetTransliterations</c>.</param>
        /// <param name="stemWords">Whether dictionary look-ups use the stemmed form of each word.</param>
        /// <returns>The processed entries keyed by lower-cased term.</returns>
        public static Dictionary <string, ProcessedTermEntry> ProcessTerms(Dictionary <string, SimpleTermEntry> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            Dictionary <string, ProcessedTermEntry> res = new Dictionary <string, ProcessedTermEntry>(1000);
            Dictionary <string, int> lowercasedWordDict = new Dictionary <string, int>(1000);

            if (terms != null)
            {
                Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);
                // Cache key: "<src>_<trg>" when a transliteration entry is given, otherwise the plain language.
                string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey))
                {
                    translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
                }
                foreach (string lowerCase in terms.Keys)
                {
                    if (res.ContainsKey(lowerCase)) //TODO: Nothing to do, but be aware that here we allow only the first capitalization of a surface form ... we will ignore other capitalizations.
                    {
                        continue;
                    }

                    SimpleTermEntry term        = terms[lowerCase];
                    string          surfaceForm = term.term;

                    ProcessedTermEntry pte = new ProcessedTermEntry();
                    pte.surfaceForm      = surfaceForm;
                    pte.concordance      = !string.IsNullOrWhiteSpace(term.conc) ? term.conc : "";
                    pte.normMsdSeq       = !string.IsNullOrWhiteSpace(term.normMsdSeq) ? new List <string>(whitespaceRegex.Split(term.normMsdSeq)) : new List <string>();
                    pte.normSeq          = !string.IsNullOrWhiteSpace(term.normSeq) ? new List <string>(whitespaceRegex.Split(term.normSeq)) : new List <string>();
                    pte.lowercaceForm    = lowerCase;
                    pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm) ? new List <string>(whitespaceRegex.Split(surfaceForm)) : new List <string>();

                    // A blank key has no words. Using an empty array (instead of null) keeps the
                    // word loop below from throwing a NullReferenceException for such keys.
                    string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase) ? whitespaceRegex.Split(lowerCase) : new string[0];
                    pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

                    if (!string.IsNullOrWhiteSpace(term.lemmaSeq))
                    {
                        pte.lemmaSeq = new List <string>(whitespaceRegex.Split(term.lemmaSeq));
                    }
                    else
                    {
                        // No lemmas available: pad with one empty string per word.
                        pte.lemmaSeq = new List <string>();
                        for (int i = 0; i < pte.lowercaseWords.Count; i++)
                        {
                            pte.lemmaSeq.Add("");
                        }
                    }
                    if (!string.IsNullOrWhiteSpace(term.msdSeq))
                    {
                        pte.msdSeq = new List <string>(whitespaceRegex.Split(term.msdSeq));
                    }
                    else
                    {
                        // No MSD tags available: pad with one empty string per word.
                        pte.msdSeq = new List <string>();
                        for (int i = 0; i < pte.lowercaseWords.Count; i++)
                        {
                            pte.msdSeq.Add("");
                        }
                    }

                    foreach (string word in lowerCaseWordArr)
                    {
                        pte.len += word.Length;
                        // Queue the word for transliteration unless it is already queued or cached.
                        if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                        {
                            lowercasedWordDict.Add(word, 0);
                        }

                        // The dictionary is keyed by stems when stemming is on, by plain words otherwise.
                        string lookupKey = stemWords ? LightweightStemmer.Stem(word, lang) : word;

                        pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));

                        if (srcToTrgDict != null)
                        {
                            List <StringProbabEntry> currList = new List <StringProbabEntry>();
                            Dictionary <string, double> targets;
                            if (srcToTrgDict.TryGetValue(lookupKey, out targets))
                            {
                                foreach (KeyValuePair <string, double> target in targets)
                                {
                                    StringProbabEntry spe = new StringProbabEntry();
                                    spe.str    = target.Key;
                                    spe.probab = target.Value;
                                    currList.Add(spe);
                                }
                            }
                            // One (possibly empty) candidate list per word keeps the lists aligned with the words.
                            pte.translationList.Add(currList);
                        }
                    }
                    res.Add(lowerCase, pte);
                }

                Dictionary <string, List <StringProbabEntry> > translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);

                foreach (ProcessedTermEntry pte in res.Values)
                {
                    foreach (string word in pte.lowercaseWords)
                    {
                        List <StringProbabEntry> translits;
                        if (translitDict.TryGetValue(word, out translits))
                        {
                            pte.transliterationList.Add(translits);
                        }
                        else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                        {
                            pte.transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            pte.transliterationList.Add(new List <StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count >= 25000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return(res);
        }