Example #1
0
        public static Dictionary<string, ProcessedTermEntry> ProcessTerms(Dictionary<string, SimpleTermEntry> terms, Dictionary<string,Dictionary<string,double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1 , bool stemWords = false)
        {
            Dictionary<string, ProcessedTermEntry> res = new Dictionary<string, ProcessedTermEntry>(1000);
            Dictionary<string, int> lowercasedWordDict = new Dictionary<string, int>(1000);
            if (terms!=null)
            {
                Log.Write ("Starting pre-processing of "+terms.Count.ToString()+" "+ lang +" terms.",LogLevelType.LIMITED_OUTPUT);
                string langKey = translitEntry!=null?((translitEntry.srcLang!=null?translitEntry.srcLang:"")+"_"+(translitEntry.trgLang!=null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey)) translitTemp.Add(langKey,new Dictionary<string, List<StringProbabEntry>>());
                foreach(string lowerCase in terms.Keys)
                {
                    string surfaceForm = terms[lowerCase].term;
                    if (!res.ContainsKey(lowerCase)) //TODO: Nothing to do, but be aware that here we allow only the first capitalization of a surface form ... we will ignore other capitalizations.
                    {
                        ProcessedTermEntry pte = new ProcessedTermEntry();
                        pte.surfaceForm = surfaceForm;
                        pte.concordance = !string.IsNullOrWhiteSpace (terms [lowerCase].conc) ? terms [lowerCase].conc : "";
                        pte.normMsdSeq = !string.IsNullOrWhiteSpace (terms [lowerCase].normMsdSeq) ? new List<string> (whitespaceRegex.Split (terms [lowerCase].normMsdSeq)) : new List<string> ();
                        pte.normSeq = !string.IsNullOrWhiteSpace (terms [lowerCase].normSeq) ? new List<string>(whitespaceRegex.Split(terms[lowerCase].normSeq)):new List<string>();
                        pte.lowercaceForm = lowerCase;
                        pte.surfaceFormWords = !string.IsNullOrWhiteSpace (surfaceForm) ? new List<string> (whitespaceRegex.Split (surfaceForm)) : new List<string> ();
                        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace (lowerCase) ? whitespaceRegex.Split (lowerCase) : null;
                        if (lowerCaseWordArr!=null)
                        {
                            pte.lowercaseWords.InsertRange(0,lowerCaseWordArr);
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].lemmaSeq))
                        {
                            pte.lemmaSeq = new List<string>(whitespaceRegex.Split(terms[lowerCase].lemmaSeq));
                        }
                        else
                        {
                            pte.lemmaSeq = new List<string>();
                            for (int i=0;i<pte.lowercaseWords.Count;i++){pte.lemmaSeq.Add("");}
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].msdSeq))
                        {
                            pte.msdSeq = new List<string>(whitespaceRegex.Split(terms[lowerCase].msdSeq));
                        }
                        else
                        {
                            pte.msdSeq = new List<string>();
                            for (int i=0;i<pte.lowercaseWords.Count;i++){pte.msdSeq.Add("");}
                        }
                        foreach(string word in lowerCaseWordArr)
                        {
                            pte.len+=word.Length;
                            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                            {
                                lowercasedWordDict.Add(word,0);
                            }
                            string stem = null;
                            if (stemWords)
                            {
                                stem = LightweightStemmer.Stem(word,lang);
                            }
                            //if (lang !="en")
                            //{
                                pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                            //}
                            //else
                            //{
                            //    pte.simpleTransliteration = pte.lowercaseWords;
                            //}
                            if (srcToTrgDict!=null)
                            {
                                List<StringProbabEntry> currList = new List<StringProbabEntry>();
                                if (stemWords)
                                {
                                    if (srcToTrgDict.ContainsKey(stem))
                                    {
                                        foreach(string trgStem in srcToTrgDict[stem].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str=trgStem;
                                            spe.probab = srcToTrgDict[stem][trgStem];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                else
                                {
                                    if (srcToTrgDict.ContainsKey(word))
                                    {
                                        foreach(string trgWord in srcToTrgDict[word].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str=trgWord;
                                            spe.probab = srcToTrgDict[word][trgWord];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                pte.translationList.Add(currList);
                            }

                        }
                        res.Add(lowerCase,pte);
                    }
                }
                Dictionary<string, List<StringProbabEntry>> translitDict = new Dictionary<string, List<StringProbabEntry>>();
                //if (threadCount<2)
                //{
                    translitDict = GetTransliterations(lowercasedWordDict,translitEntry, mosesPath, tempFilePath,threadCount);
                    //Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                    //CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                foreach (string lowerCase in res.Keys)
                {
                    foreach(string word in res[lowerCase].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey)&&translitTemp[langKey].ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[lowerCase].transliterationList.Add(new List<StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count>=25000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return res;
        }
Example #2
0
        public static List<ProcessedTermEntry> ProcessTermsList(List<string> terms, Dictionary<string, Dictionary<string, double>> srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1 , bool stemWords = false)
        {
            List<ProcessedTermEntry> res = new List<ProcessedTermEntry>(1000);
            Dictionary<string, int> lowercasedWordDict = new Dictionary<string, int>(1000);
            if (terms!=null)
            {
                string langKey = translitEntry!=null?((translitEntry.srcLang!=null?translitEntry.srcLang:"")+"_"+(translitEntry.trgLang!=null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey)) translitTemp.Add(langKey,new Dictionary<string, List<StringProbabEntry>>());
                foreach(string surfaceForm in terms)
                {
                    string lowerCase = surfaceForm.ToLower();
                    ProcessedTermEntry pte = new ProcessedTermEntry();
                    pte.surfaceForm = surfaceForm;
                    pte.lowercaceForm = lowerCase;
                    pte.surfaceFormWords = new List<string>(whitespaceRegex.Split(surfaceForm));
                    string[] lowerCaseWordArr = whitespaceRegex.Split(lowerCase);
                    pte.lowercaseWords.InsertRange(0,lowerCaseWordArr);

                    foreach(string word in lowerCaseWordArr)
                    {
                        pte.len+=word.Length;
                        if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                        {
                            lowercasedWordDict.Add(word,0);
                        }
                        string stem = null;
                        if (stemWords)
                        {
                            stem = LightweightStemmer.Stem(word,lang);
                        }
                        //if (lang !="en")
                        //{
                        pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                        //}
                        //else
                        //{
                        //    pte.simpleTransliteration = pte.lowercaseWords;
                        //}
                        if (srcToTrgDict!=null)
                        {
                            List<StringProbabEntry> currList = new List<StringProbabEntry>();
                            if (stemWords)
                            {
                                if (srcToTrgDict.ContainsKey(stem))
                                {
                                    foreach(string trgStem in srcToTrgDict[stem].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str=trgStem;
                                        spe.probab = srcToTrgDict[stem][trgStem];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            else
                            {
                                if (srcToTrgDict.ContainsKey(word))
                                {
                                    foreach(string trgWord in srcToTrgDict[word].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str=trgWord;
                                        spe.probab = srcToTrgDict[word][trgWord];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            pte.translationList.Add(currList);
                        }
                    }
                    res.Add(pte);
                }
                Dictionary<string, List<StringProbabEntry>> translitDict = new Dictionary<string, List<StringProbabEntry>>();
                //if (threadCount<2)
                //{
                    translitDict = GetTransliterations(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                    //This is not nice, however necessary due to the multi-threaded execution - the temp list is not updated in the single-thread scenario
                //    Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                //    CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                for(int i=0; i<res.Count;i++)
                {
                    foreach(string word in res[i].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey)&&translitTemp[langKey].ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[i].transliterationList.Add(new List<StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count>=50000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return res;
        }
Example #3
0
        public static List <ProcessedTermEntry> ProcessTermsList(List <string> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            List <ProcessedTermEntry> res = new List <ProcessedTermEntry>(1000);
            Dictionary <string, int>  lowercasedWordDict = new Dictionary <string, int>(1000);

            if (terms != null)
            {
                string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey))
                {
                    translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
                }
                foreach (string surfaceForm in terms)
                {
                    string             lowerCase = surfaceForm.ToLower();
                    ProcessedTermEntry pte       = new ProcessedTermEntry();
                    pte.surfaceForm      = surfaceForm;
                    pte.lowercaceForm    = lowerCase;
                    pte.surfaceFormWords = new List <string>(whitespaceRegex.Split(surfaceForm));
                    string[] lowerCaseWordArr = whitespaceRegex.Split(lowerCase);
                    pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);

                    foreach (string word in lowerCaseWordArr)
                    {
                        pte.len += word.Length;
                        if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                        {
                            lowercasedWordDict.Add(word, 0);
                        }
                        string stem = null;
                        if (stemWords)
                        {
                            stem = LightweightStemmer.Stem(word, lang);
                        }
                        //if (lang !="en")
                        //{
                        pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                        //}
                        //else
                        //{
                        //    pte.simpleTransliteration = pte.lowercaseWords;
                        //}
                        if (srcToTrgDict != null)
                        {
                            List <StringProbabEntry> currList = new List <StringProbabEntry>();
                            if (stemWords)
                            {
                                if (srcToTrgDict.ContainsKey(stem))
                                {
                                    foreach (string trgStem in srcToTrgDict[stem].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str    = trgStem;
                                        spe.probab = srcToTrgDict[stem][trgStem];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            else
                            {
                                if (srcToTrgDict.ContainsKey(word))
                                {
                                    foreach (string trgWord in srcToTrgDict[word].Keys)
                                    {
                                        StringProbabEntry spe = new StringProbabEntry();
                                        spe.str    = trgWord;
                                        spe.probab = srcToTrgDict[word][trgWord];
                                        currList.Add(spe);
                                    }
                                }
                            }
                            pte.translationList.Add(currList);
                        }
                    }
                    res.Add(pte);
                }
                Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
                //if (threadCount<2)
                //{
                translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
                //This is not nice, however necessary due to the multi-threaded execution - the temp list is not updated in the single-thread scenario
                //    Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                //    CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                for (int i = 0; i < res.Count; i++)
                {
                    foreach (string word in res[i].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                        {
                            res[i].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[i].transliterationList.Add(new List <StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count >= 50000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return(res);
        }
Example #4
0
        public static Dictionary <string, ProcessedTermEntry> ProcessTerms(Dictionary <string, SimpleTermEntry> terms, Dictionary <string, Dictionary <string, double> > srcToTrgDict, string lang, MPAlignerConfigurationTranslEntry translitEntry, string mosesPath, string tempFilePath, int threadCount = 1, bool stemWords = false)
        {
            Dictionary <string, ProcessedTermEntry> res = new Dictionary <string, ProcessedTermEntry>(1000);
            Dictionary <string, int> lowercasedWordDict = new Dictionary <string, int>(1000);

            if (terms != null)
            {
                Log.Write("Starting pre-processing of " + terms.Count.ToString() + " " + lang + " terms.", LogLevelType.LIMITED_OUTPUT);
                string langKey = translitEntry != null?((translitEntry.srcLang != null?translitEntry.srcLang:"") + "_" + (translitEntry.trgLang != null?translitEntry.trgLang:"")):lang;
                if (!translitTemp.ContainsKey(langKey))
                {
                    translitTemp.Add(langKey, new Dictionary <string, List <StringProbabEntry> >());
                }
                foreach (string lowerCase in terms.Keys)
                {
                    string surfaceForm = terms[lowerCase].term;
                    if (!res.ContainsKey(lowerCase)) //TODO: Nothing to do, but be aware that here we allow only the first capitalization of a surface form ... we will ignore other capitalizations.
                    {
                        ProcessedTermEntry pte = new ProcessedTermEntry();
                        pte.surfaceForm      = surfaceForm;
                        pte.concordance      = !string.IsNullOrWhiteSpace(terms [lowerCase].conc) ? terms [lowerCase].conc : "";
                        pte.normMsdSeq       = !string.IsNullOrWhiteSpace(terms [lowerCase].normMsdSeq) ? new List <string> (whitespaceRegex.Split(terms [lowerCase].normMsdSeq)) : new List <string> ();
                        pte.normSeq          = !string.IsNullOrWhiteSpace(terms [lowerCase].normSeq) ? new List <string>(whitespaceRegex.Split(terms[lowerCase].normSeq)):new List <string>();
                        pte.lowercaceForm    = lowerCase;
                        pte.surfaceFormWords = !string.IsNullOrWhiteSpace(surfaceForm) ? new List <string> (whitespaceRegex.Split(surfaceForm)) : new List <string> ();
                        string[] lowerCaseWordArr = !string.IsNullOrWhiteSpace(lowerCase) ? whitespaceRegex.Split(lowerCase) : null;
                        if (lowerCaseWordArr != null)
                        {
                            pte.lowercaseWords.InsertRange(0, lowerCaseWordArr);
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].lemmaSeq))
                        {
                            pte.lemmaSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].lemmaSeq));
                        }
                        else
                        {
                            pte.lemmaSeq = new List <string>();
                            for (int i = 0; i < pte.lowercaseWords.Count; i++)
                            {
                                pte.lemmaSeq.Add("");
                            }
                        }
                        if (!string.IsNullOrWhiteSpace(terms[lowerCase].msdSeq))
                        {
                            pte.msdSeq = new List <string>(whitespaceRegex.Split(terms[lowerCase].msdSeq));
                        }
                        else
                        {
                            pte.msdSeq = new List <string>();
                            for (int i = 0; i < pte.lowercaseWords.Count; i++)
                            {
                                pte.msdSeq.Add("");
                            }
                        }
                        foreach (string word in lowerCaseWordArr)
                        {
                            pte.len += word.Length;
                            if (!lowercasedWordDict.ContainsKey(word) && !translitTemp[langKey].ContainsKey(word))
                            {
                                lowercasedWordDict.Add(word, 0);
                            }
                            string stem = null;
                            if (stemWords)
                            {
                                stem = LightweightStemmer.Stem(word, lang);
                            }
                            //if (lang !="en")
                            //{
                            pte.simpleTransliteration.Add(SimpleCharacterTransliteration.Transliterate(word));
                            //}
                            //else
                            //{
                            //    pte.simpleTransliteration = pte.lowercaseWords;
                            //}
                            if (srcToTrgDict != null)
                            {
                                List <StringProbabEntry> currList = new List <StringProbabEntry>();
                                if (stemWords)
                                {
                                    if (srcToTrgDict.ContainsKey(stem))
                                    {
                                        foreach (string trgStem in srcToTrgDict[stem].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str    = trgStem;
                                            spe.probab = srcToTrgDict[stem][trgStem];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                else
                                {
                                    if (srcToTrgDict.ContainsKey(word))
                                    {
                                        foreach (string trgWord in srcToTrgDict[word].Keys)
                                        {
                                            StringProbabEntry spe = new StringProbabEntry();
                                            spe.str    = trgWord;
                                            spe.probab = srcToTrgDict[word][trgWord];
                                            currList.Add(spe);
                                        }
                                    }
                                }
                                pte.translationList.Add(currList);
                            }
                        }
                        res.Add(lowerCase, pte);
                    }
                }
                Dictionary <string, List <StringProbabEntry> > translitDict = new Dictionary <string, List <StringProbabEntry> >();
                //if (threadCount<2)
                //{
                translitDict = GetTransliterations(lowercasedWordDict, translitEntry, mosesPath, tempFilePath, threadCount);
                //Dictionary<string, List<StringProbabEntry>> tmp = new Dictionary<string, List<StringProbabEntry>>();
                //CopyTranslits(translitDict,tmp, translitEntry);
                //}
                //else
                //{
                //    translitDict = GetTransliterationsMultiThreaded(lowercasedWordDict,translitEntry, mosesPath, tempFilePath, threadCount);
                //}
                foreach (string lowerCase in res.Keys)
                {
                    foreach (string word in res[lowerCase].lowercaseWords)
                    {
                        if (translitDict.ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitDict[word]);
                        }
                        else if (translitTemp.ContainsKey(langKey) && translitTemp[langKey].ContainsKey(word))
                        {
                            res[lowerCase].transliterationList.Add(translitTemp[langKey][word]);
                        }
                        else
                        {
                            res[lowerCase].transliterationList.Add(new List <StringProbabEntry>());
                        }
                    }
                }
                //We add a simple data amount threshold in order not to overflow the memory ...
                if (translitTemp[langKey].Count >= 25000)
                {
                    translitTemp[langKey].Clear();
                    GC.Collect();
                }
            }
            return(res);
        }
Example #5
0
        public static AlignmentInfoElement AlignSingleTermPair(ProcessedTermEntry srcPte, ProcessedTermEntry trgPte)
        {
            if (srcPte!=null && trgPte!=null)
            {
                AlignmentInfoElement aie = new AlignmentInfoElement();
                List<WordAlignmentElement> srcToTrg = new List<WordAlignmentElement>();
                List<WordAlignmentElement> trgToSrc = new List<WordAlignmentElement>();
                maxStrLen = 0;

                if (_interlinguaDictUsed && _interlinguaTranslitUsed)
                {

                    ///Types:
                    /// 0 - dictionary,
                    /// 1 - simple translit,
                    /// 2 - target,
                    /// 3 - translit

                    //Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);

                    //Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);

                    //Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLITERATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.translationList, trgPte.transliterationList, srcToTrg, trgToSrc, 0, 3);

                    //Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);

                    //Translation is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.transliterationList, trgPte.translationList, srcToTrg, trgToSrc, 3, 0);

                    //Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);

                    //Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);

                    //Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 3);

                    //Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringLists (_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);

                }
                else if (_interlinguaTranslitUsed)
                {
                    //Translation is in target language; SOURCE TRANSLATION vs TARGET
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);

                    //Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.transliterationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 3, 1);

                    //Transliteration is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLITERATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.simpleTransliteration, trgPte.transliterationList, srcToTrg, trgToSrc, 1, 3);

                    //Transliteration is in EN language; SOURCE TRANSLITERATION vs TARGET TRANSLITERATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.transliterationList, trgPte.transliterationList, srcToTrg, trgToSrc, 3, 2);

                    //Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringLists (_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);

                    //Translation is in target language; SOURCE vs TARGET TRANSLATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);
                }
                else if (_interlinguaDictUsed)
                {
                    //Translation is in EN language; SOURCE TRANSLATION vs TARGET TRANSLATION
                    AlignStringProbabEntryListLists (_lpeConf, srcPte.translationList, trgPte.translationList, srcToTrg, trgToSrc, 0, 0);

                    //Translation is in EN language; SOURCE TRANSLATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.translationList, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 0, 1);

                    //Translation is in EN language; SOURCE SIMPLE TRANSLITERATION vs TARGET TRANSLATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.simpleTransliteration, trgPte.translationList, srcToTrg, trgToSrc, 1, 0);

                    //Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);

                    //Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringLists (_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);

                    //Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);

                }
                else
                {
                    //Translation is in target language; SOURCE TRANSLATION vs TARGET
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.translationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 0, 2);

                    //Transliteration is in target language; SOURCE TRANSLITERATION vs TARGET
                    AlignStringProbabEntryListToStringList (_lpeConf, srcPte.transliterationList, trgPte.lowercaseWords, srcToTrg, trgToSrc, 3, 2);

                    //Translation is in target language; SOURCE vs TARGET TRANSLATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.lowercaseWords, trgPte.translationList, srcToTrg, trgToSrc, 2, 0);

                    //Transliteration is in target language; SOURCE vs TARGET TRANSLITERATION
                    AlignStringListToStringProbabEntryList (_lpeConf, srcPte.lowercaseWords, trgPte.transliterationList, srcToTrg, trgToSrc, 2, 3);

                    //Simple translit of both is in EN; SOURCE SIMPLE TRANSLITERATION vs TARGET SIMPLE TRANSLITERATION
                    AlignStringLists (_lpeConf, srcPte.simpleTransliteration, trgPte.simpleTransliteration, srcToTrg, trgToSrc, 1, 1);

                }
                aie.srcToTrgAlignments = srcToTrg;
                aie.trgToSrcAlignments = trgToSrc;

                aie.srcEntry = srcPte;
                aie.trgEntry = trgPte;

                ConsolidateOverlaps(_lpeConf,aie, _excDict);
                if(CreateStrListsForEval(_configuration,aie,_srcStopWords,_trgStopWords))
                {
                    aie.alignmentScore = EvaluateAlignmentScore(_lpeConf,aie);
                    if (aie.alignmentScore>=_lpeConf.finalAlignmentThr)
                    {
                        //If you wish to debug the process, comment the lines below that clear the alignments...
                        aie.srcToTrgAlignments.Clear();
                        aie.trgToSrcAlignments.Clear();
                        aie.consolidatedAlignment.Clear();
                        aie.srcFile = _srcFile;
                        aie.trgFile = _trgFile;
                        return aie;
                    }
                }
            }
            return null;
        }
Example #6
0
 public static double GetProbab(ProcessedTermEntry pte,int id, short type, int typeId)
 {
     ///Types:
     /// 0 - dictionary,
     /// 1 - simple translit,
     /// 2 - target,
     /// 3 - translit
     if (type==0)
     {
         return pte.translationList[id][typeId].probab;
     }
     else if (type == 1)
     {
         return 1;
     }
     else if (type == 2)
     {
         return 1;
     }
     else if (type == 3)
     {
         return pte.transliterationList[id][typeId].probab;
     }
     return 1;
 }
Example #7
0
 public static string GetCorrectString(ProcessedTermEntry pte,int id, short type, int typeId)
 {
     ///Types:
     /// 0 - dictionary,
     /// 1 - simple translit,
     /// 2 - target,
     /// 3 - translit
     if (type==0)
     {
         return pte.translationList[id][typeId].str;
     }
     else if (type == 1)
     {
         return pte.simpleTransliteration[id];
     }
     else if (type == 2)
     {
         return pte.lowercaseWords[id];
     }
     else if (type == 3)
     {
         return pte.transliterationList[id][typeId].str;
     }
     return null;
 }