Пример #1
0
        public static List <ProcessedTermEntry> ProcessTermsList(List <string> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            List <ProcessedTermEntry> res = new List <ProcessedTermEntry>(1000);
            Dictionary <string, int>  lowercasedWordDict = new Dictionary <string, int>(1000);

            if (terms != null)
            {
                string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey))
                {
                    translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
                }
                foreach (string surfaceForm in terms)
                {
                    string             lowerCase = surfaceForm.ToLower();
                    ProcessedTermEntry pte       = new ProcessedTermEntry();
                    pte.surfaceForm      = surfaceForm;
                    pte.lowercaceForm    = lowerCase;
                    pte.surfaceFormWords = new List <string>(whitespaceRegex.Split(surfaceForm));
                    string[] lowerCaseWordArr = whitespaceRegex.Split(lowerCase);
                    pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

                    foreach (string word in lowerCaseWordArr)
                    {
                        pte.len += word.Length;
                        if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                        {
                            lowercasedWordDict.Add(word, 0);
                        }
                        string stem = null;
                        if (stemWords)
                        {
                            stem = LightweightStemmer.Stem(word, lang);
                        }
                        //if (lang !="en")
                        //{
                        pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                        //}
                        //else
                        //{
                        //    pte.simpleTransliteration = pte.lowercaseWords;
                        //}
                        if (srcToTrgDict != null)
                        {
                            List <StringProbabEntry> currList = new List <StringProbabEntry>();
                            if (stemWords)
                            {
                                if (srcToTrgDict.ContainsKey(stem))
                                {
                                    foreach (string trgStem in srcToTrgDict[stem].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str    = trgStem;
                                        spe.probab = srcToTrgDict[stem][trgStem];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            else
                            {
                                if (srcToTrgDict.ContainsKey(word))
                                {
                                    foreach (string trgWord in srcToTrgDict[word].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str    = trgWord;
                                        spe.probab = srcToTrgDict[word][trgWord];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            pte.translationList.Add(currList);
                        }
                    }
                    res.Add(pte);
                }
                Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
                //if (threadCount<2)
                //{
                translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
                //This is not nice, however necessary due to the multi-threaded execution - the temp list is not updated in the single-thread scenario
                //    Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                //    CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                for (int i = 0; i < res.Count; i++)
                {
                    foreach (string word in res[i].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[i].transliterationList.Add(new List <StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count >= 50000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return(res);
        }
Пример #2
0
        private static void CopyTranslits(Dictionary <string, List <StringProbabEntry> > fromDict, Dictionary <string, List <StringProbabEntry> > toDict, MPAlignerConfigurationTranslEntry translEntry)
        {
            string langKey = translEntry != null ? ((translEntry.srcLang != null ? translEntry.srcLang : "") + "_" + (translEntry.trgLang != null ? translEntry.trgLang : "")) : "";

            if (fromDict != null)
            {
                if (toDict == null)
                {
                    toDict = new Dictionary <string, List <StringProbabEntry> >();
                }
                foreach (string term in fromDict.Keys)
                {
                    if (!toDict.ContainsKey(term))
                    {
                        toDict.Add(term, fromDict[term]);
                        if (!translitTemp[langKey].ContainsKey(term))
                        {
                            translitTemp[langKey].Add(term, new List <StringProbabEntry>());
                        }
                        translitTemp[langKey][term] = fromDict[term];
                    }
                }
            }
        }
Пример #3
0
        public static Dictionary <string, List <StringProbabEntry> > GetTransliterations(Dictionary <string, int> lowerCasedTerms, MPAlignerConfigurationTranslEntry translEntry, string mosesPath, string tempFilePath, int threadCount)
        {
            Dictionary <string, List <StringProbabEntry> > res = new Dictionary <string, List <StringProbabEntry> > ();

            if (translEntry == null || lowerCasedTerms == null || lowerCasedTerms.Count < 1 || string.IsNullOrWhiteSpace(mosesPath) || string.IsNullOrWhiteSpace(tempFilePath))
            {
                return(res);
            }
            string langKey = translEntry != null ? ((translEntry.srcLang != null ? translEntry.srcLang : "") + "_" + (translEntry.trgLang != null ? translEntry.trgLang : "")) : "";

            Log.Write("Starting transliteration of " + lowerCasedTerms.Count.ToString() + " tokens.", LogLevelType.LIMITED_OUTPUT);
            int idx = 0;
            List <List <string> > lowerCasedTermDictList = new List <List <string> > (threadCount);

            for (int i = 0; i < threadCount; i++)
            {
                lowerCasedTermDictList.Add(new List <string> ());
            }
            foreach (string term in lowerCasedTerms.Keys)
            {
                lowerCasedTermDictList [idx % threadCount].Add(term);
                idx++;
            }

            string         directory = Path.GetDirectoryName(mosesPath);
            List <Process> processes = new List <Process> ();

            for (int i = 0; i < lowerCasedTermDictList.Count; i++)
            {
                if (lowerCasedTermDictList [i].Count > 0)
                {
                    try {
                        string tmpFile = tempFilePath + i.ToString() + ".tmp";
                        WriteWordsForTransliteration(lowerCasedTermDictList [i], tmpFile);
                        ProcessStartInfo myProcessStartInfo = new ProcessStartInfo(mosesPath);
                        myProcessStartInfo.UseShellExecute        = false;
                        myProcessStartInfo.WorkingDirectory       = directory;
                        myProcessStartInfo.FileName               = mosesPath;
                        myProcessStartInfo.CreateNoWindow         = true;
                        myProcessStartInfo.RedirectStandardOutput = true;
                        myProcessStartInfo.RedirectStandardError  = true;

                        StringBuilder sb = new StringBuilder();
                        sb.Append(" -f ");
                        sb.Append("\"" + translEntry.mosesIniPath + "\" ");
                        sb.Append(" -i ");
                        sb.Append("\"" + tmpFile + "\" ");
                        sb.Append(" -n-best-list ");
                        sb.Append("\"" + tmpFile + ".n_best\" " + translEntry.nBest.ToString());
                        myProcessStartInfo.Arguments = sb.ToString();

                        processes.Add(new Process());
                        processes [processes.Count - 1].StartInfo = myProcessStartInfo;
                        bool started = processes [processes.Count - 1].Start();
                        processes [processes.Count - 1].ErrorDataReceived  += p_ErrorDataReceived;
                        processes [processes.Count - 1].OutputDataReceived += p_OutputDataReceived;
                        processes [processes.Count - 1].BeginOutputReadLine();
                        processes [processes.Count - 1].BeginErrorReadLine();
                    } catch {
                    }
                }
            }
            for (int i = 0; i < processes.Count; i++)
            {
                processes [i].WaitForExit();
                processes [i].Close();
                processes [i].Dispose();
            }
            processes.Clear();

            for (int i = 0; i < lowerCasedTermDictList.Count; i++)
            {
                if (lowerCasedTermDictList[i].Count > 0)
                {
                    string tmpFile = tempFilePath + i.ToString() + ".tmp";
                    if (File.Exists(tmpFile + ".n_best"))
                    {
                        NumberFormatInfo nfi = new NumberFormatInfo();
                        nfi.CurrencyDecimalSeparator = ".";
                        nfi.NumberDecimalSeparator   = ".";
                        nfi.PercentDecimalSeparator  = ".";
                        Dictionary <string, Dictionary <string, bool> > existingTranslits = new Dictionary <string, Dictionary <string, bool> > ();

                        StreamReader sr  = new StreamReader(tmpFile + ".n_best", Encoding.UTF8);
                        string[]     sep = { "|||" };
                        while (!sr.EndOfStream)
                        {
                            string   line    = sr.ReadLine();
                            string[] dataArr = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
                            if (dataArr.Length == 4)
                            {
                                try {
                                    string idStr = dataArr [0];
                                    idStr = idStr.Trim();
                                    int    id   = Convert.ToInt32(idStr);
                                    string word = dataArr [1];

                                    StringProbabEntry spe = new StringProbabEntry();
                                    spe.str = word.Trim().Replace(" ", "");
                                    string probabStr = dataArr [3];
                                    probabStr  = probabStr.Trim().Replace(',', '.');
                                    spe.probab = Math.Exp(Convert.ToDouble(probabStr, nfi));
                                    if (spe.probab > 1)
                                    {
                                        spe.probab = 1;
                                    }
                                    if (id < lowerCasedTermDictList[i].Count)
                                    {
                                        string term    = lowerCasedTermDictList[i][id];
                                        double min     = Math.Min(spe.str.Length, term.Length);
                                        double max     = Math.Max(spe.str.Length, term.Length);
                                        double lenDiff = min / max;
                                        //Log.Write(term+" "+word+" "+lenDiff.ToString()+" "+spe.probab.ToString(),LogLevelType.ERROR);
                                        if (lenDiff >= translEntry.maxLenDiff)
                                        {
                                            if (!existingTranslits.ContainsKey(term))
                                            {
                                                existingTranslits.Add(term, new Dictionary <string, bool> ());
                                            }

                                            if (!res.ContainsKey(term))
                                            {
                                                res.Add(term, new List <StringProbabEntry> ());
                                            }
                                            if (!translitTemp[langKey].ContainsKey(term))
                                            {
                                                translitTemp[langKey].Add(term, new List <StringProbabEntry>());
                                            }
                                            if (!existingTranslits [term].ContainsKey(spe.str) && spe.probab >= translEntry.threshold)
                                            {
                                                spe.probab = translEntry.translitBf.Get(spe.probab);
                                                existingTranslits [term].Add(spe.str, true);
                                                res [term].Add(spe);
                                                translitTemp[langKey][term].Add(spe);
                                            }
                                        }
                                    }
                                } catch {
                                }
                            }
                        }
                    }
                    try {
                        File.Delete(tmpFile + ".n_best");
                        File.Delete(tmpFile);
                    } catch {
                    }
                }
            }
            GC.Collect();
            GC.WaitForPendingFinalizers();
            return(res);
        }
Пример #4
0
        public static Dictionary <string, ProcessedTermEntry> ProcessTerms(Dictionary <string, SimpleTermEntry> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            Dictionary <string, ProcessedTermEntry> res = new Dictionary <string, ProcessedTermEntry>(1000);
            Dictionary <string, int> lowercasedWordDict = new Dictionary <string, int>(1000);

            if (terms != null)
            {
                Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);
                string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey))
                {
                    translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
                }
                foreach (string lowerCase in terms.Keys)
                {
                    string surfaceForm = terms[lowerCase].term;
                    if (!res.ContainsKey(lowerCase)) //TODO: Nothing to do, but be aware that here we allow only the first capitalization of a surface form ... we will ignore other capitalizations.
                    {
                        ProcessedTermEntry pte = new ProcessedTermEntry();
                        pte.surfaceForm      = surfaceForm;
                        pte.concordance      = !string.IsNullOrWhiteSpace(terms [lowerCase].conc) ? terms [lowerCase].conc : "";
                        pte.normMsdSeq       = !string.IsNullOrWhiteSpace(terms [lowerCase].normMsdSeq) ? new List <string> (whitespaceRegex.Split(terms [lowerCase].normMsdSeq)) : new List <string> ();
                        pte.normSeq          = !string.IsNullOrWhiteSpace(terms [lowerCase].normSeq) ? new List <string>(whitespaceRegex.Split(terms[lowerCase].normSeq)):new List <string>();
                        pte.lowercaceForm    = lowerCase;
                        pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm) ? new List <string> (whitespaceRegex.Split(surfaceForm)) : new List <string> ();
                        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase) ? whitespaceRegex.Split(lowerCase) : null;
                        if (lowerCaseWordArr != null)
                        {
                            pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].lemmaSeq))
                        {
                            pte.lemmaSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].lemmaSeq));
                        }
                        else
                        {
                            pte.lemmaSeq = new List <string>();
                            for (int i = 0; i < pte.lowercaseWords.Count; i++)
                            {
                                pte.lemmaSeq.Add("");
                            }
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].msdSeq))
                        {
                            pte.msdSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].msdSeq));
                        }
                        else
                        {
                            pte.msdSeq = new List <string>();
                            for (int i = 0; i < pte.lowercaseWords.Count; i++)
                            {
                                pte.msdSeq.Add("");
                            }
                        }
                        foreach (string word in lowerCaseWordArr)
                        {
                            pte.len += word.Length;
                            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                            {
                                lowercasedWordDict.Add(word, 0);
                            }
                            string stem = null;
                            if (stemWords)
                            {
                                stem = LightweightStemmer.Stem(word, lang);
                            }
                            //if (lang !="en")
                            //{
                            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                            //}
                            //else
                            //{
                            //    pte.simpleTransliteration = pte.lowercaseWords;
                            //}
                            if (srcToTrgDict != null)
                            {
                                List <StringProbabEntry> currList = new List <StringProbabEntry>();
                                if (stemWords)
                                {
                                    if (srcToTrgDict.ContainsKey(stem))
                                    {
                                        foreach (string trgStem in srcToTrgDict[stem].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str    = trgStem;
                                            spe.probab = srcToTrgDict[stem][trgStem];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                else
                                {
                                    if (srcToTrgDict.ContainsKey(word))
                                    {
                                        foreach (string trgWord in srcToTrgDict[word].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str    = trgWord;
                                            spe.probab = srcToTrgDict[word][trgWord];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                pte.translationList.Add(currList);
                            }
                        }
                        res.Add(lowerCase, pte);
                    }
                }
                Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
                //if (threadCount<2)
                //{
                translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
                //Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                //CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                foreach (string lowerCase in res.Keys)
                {
                    foreach (string word in res[lowerCase].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[lowerCase].transliterationList.Add(new List <StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count >= 25000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return(res);
        }