/// <summary>
/// Runs <paramref name="query"/> through <c>m_searcher</c>, using the search overload that matches
/// <typeparamref name="TEntityType"/>. Only <c>Paragraph</c> and <c>Sentence</c> are handled.
/// </summary>
/// <param name="sourceEntity">Not read by this method; only its static type parameter selects the overload.</param>
/// <param name="query">Query handed to the searcher unchanged.</param>
/// <param name="resultCount">Passed to the searcher as its second argument (presumably the maximum number of hits - confirm against the searcher).</param>
/// <returns>The <c>ResultList</c> of the search result.</returns>
/// <exception cref="ArgumentException">Thrown when <typeparamref name="TEntityType"/> is neither <c>Paragraph</c> nor <c>Sentence</c>.</exception>
private IEnumerable<IMarcellEntity> PerformSearch(TEntityType sourceEntity, ParametrizedSearchQuery query, int resultCount)
{
    Type entityType = typeof(TEntityType);

    if (entityType == typeof(Paragraph))
    {
        var paragraphSearch = m_searcher.PerformSearch<Paragraph>(query, resultCount, 1);
        return paragraphSearch.ResultList;
    }

    if (entityType == typeof(Sentence))
    {
        var sentenceSearch = m_searcher.PerformSearch<Sentence>(query, resultCount, 1);
        return sentenceSearch.ResultList;
    }

    // Every other entity type has no matching search overload here.
    throw new ArgumentException("Unsupported type provided!");
}
        /// <summary>
        /// Builds a search query for <paramref name="language"/> from the similarity data carried by
        /// <paramref name="sourceEntity"/>. Which fields are filled depends on <typeparamref name="TEntityType"/>:
        /// a Document contributes document data only, a Section adds section data, a Paragraph adds paragraph data
        /// and a Sentence adds sentence data; <c>SearchIn</c> is set to the combination of the matching index flags.
        /// </summary>
        /// <param name="sourceEntity">Entity whose consolidated tokens and topics are copied into the query.</param>
        /// <param name="language">Value assigned to the query's <c>Language</c>.</param>
        /// <returns>
        /// The populated query. For any other <typeparamref name="TEntityType"/> only <c>Language</c> is set.
        /// </returns>
        private ParametrizedSearchQuery GetQueryFromEntity(TEntityType sourceEntity, string language)
        {
            var query = new ParametrizedSearchQuery();
            query.Language = language;

            Type entityType = typeof(TEntityType);

            if (entityType == typeof(Document))
            {
                Document document = sourceEntity as Document;

                query.DocumentTokens = document.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = document.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.DocumentIndex;
            }
            else if (entityType == typeof(Section))
            {
                Section section = sourceEntity as Section;

                query.DocumentTokens = section.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = section.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SectionTokens = section.SectionSimilarityData.ConsolidatedTokens.ToArray();
                query.SectionTopics = section.SectionSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.DocumentIndex
                                 | IndexObjectType.SectionIndex;
            }
            else if (entityType == typeof(Paragraph))
            {
                Paragraph paragraph = sourceEntity as Paragraph;

                query.DocumentTokens = paragraph.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = paragraph.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SectionTokens = paragraph.SectionSimilarityData.ConsolidatedTokens.ToArray();
                query.SectionTopics = paragraph.SectionSimilarityData.ConsolidatedTopics.ToArray();
                query.ParagraphTokens = paragraph.ParagraphSimilarityData.ConsolidatedTokens.ToArray();
                query.ParagraphTopics = paragraph.ParagraphSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.DocumentIndex
                                 | IndexObjectType.SectionIndex
                                 | IndexObjectType.ParagraphIndex;
            }
            else if (entityType == typeof(Sentence))
            {
                Sentence sentence = sourceEntity as Sentence;

                query.DocumentTokens = sentence.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = sentence.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SectionTokens = sentence.SectionSimilarityData.ConsolidatedTokens.ToArray();
                query.SectionTopics = sentence.SectionSimilarityData.ConsolidatedTopics.ToArray();
                query.ParagraphTokens = sentence.ParagraphSimilarityData.ConsolidatedTokens.ToArray();
                query.ParagraphTopics = sentence.ParagraphSimilarityData.ConsolidatedTopics.ToArray();
                query.SentenceTokens = sentence.SentenceSimilarityData.ConsolidatedTokens.ToArray();
                query.SentenceTopics = sentence.SentenceSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.DocumentIndex
                                 | IndexObjectType.SectionIndex
                                 | IndexObjectType.ParagraphIndex
                                 | IndexObjectType.SentenceIndex;
            }

            return query;
        }
        /// <summary>
        /// Scores how well <paramref name="genericSourceEntity"/> survives a round trip: a search from its own
        /// language into <paramref name="language"/>, followed by a search from each hit back into the original language.
        /// </summary>
        /// <remarks>
        /// The logic is as follows:
        ///  - Sanity check: search for the entity in its own language using its own metadata. No hit returns
        ///    10 * m_failedNoMatchPenalty; a hit with a different InternalId returns 5 * m_failedFalseMatchPenalty.
        ///  - Search for up to 10 candidates using a query built for <paramref name="language"/>
        ///    (no hit returns 2 * m_failedNoMatchPenalty).
        ///  - For every candidate, search for up to 10 results back in the original language
        ///    (a candidate without any back-result returns m_failedNoMatchPenalty immediately).
        ///  - If one of the back-results is the original entity, the penalty is derived from the positions of the
        ///    two hits plus the CalculateTokenSimilarity terms; a hit at the top of both lists returns 0.
        ///  - Otherwise 7 * m_failedFalseMatchPenalty plus the same CalculateTokenSimilarity terms is returned.
        /// </remarks>
        /// <param name="genericSourceEntity">Entity the round trip starts from; its Language is used as the source language.</param>
        /// <param name="language">Language used for the forward search.</param>
        /// <returns>The penalty for this entity; 0 when the original entity is the top hit of both searches.</returns>
        protected virtual double PerformRoundTrip(TEntityType genericSourceEntity, string language)
        {
            const int scannedDocuments = 10;

            TEntityType sourceEntity = genericSourceEntity;

            //First, try to locate the entity by using the Marcell metadata within the same language.
            //This should always return the correct entity in cases where the recognition quality is high enough.
            ParametrizedSearchQuery selfAlignQuery = GetQueryFromEntity(sourceEntity, sourceEntity.Language);

            //Each result set is materialized once so that counting and indexing below never re-run the search sequence.
            var selfTranslatedParagraphs = PerformSearch(sourceEntity, selfAlignQuery, 1).ToList();
            if (selfTranslatedParagraphs.Count == 0)
            {
                //Searching with the entity's own keywords in the original language found nothing -> maximum penalty
                return 10 * m_failedNoMatchPenalty;
            }

            if (selfTranslatedParagraphs[0].InternalId != sourceEntity.InternalId)
            {
                //We located a different entity than the one we were searching for in the original language -> high penalty
                return 5 * m_failedFalseMatchPenalty;
            }

            ParametrizedSearchQuery alignQuery = GetQueryFromEntity(sourceEntity, language);
            var translatedParagraphs = PerformSearch(sourceEntity, alignQuery, scannedDocuments).ToList();
            if (translatedParagraphs.Count == 0)
            {
                return 2 * m_failedNoMatchPenalty;
            }

            TEntityType firstMatchFinal = null;
            for (int i = 0; i < translatedParagraphs.Count; i++)
            {
                var translatedParagraph = translatedParagraphs[i] as TEntityType;
                ParametrizedSearchQuery alignBackQuery = GetQueryFromEntity(translatedParagraph, sourceEntity.Language);
                var finalParagraphs = PerformSearch(translatedParagraph, alignBackQuery, scannedDocuments).ToList();

                if (finalParagraphs.Count == 0)
                {
                    return m_failedNoMatchPenalty;
                }

                if (i == 0)
                {
                    //Remember the top back-result of the top candidate; it feeds the similarity term of the penalty
                    firstMatchFinal = finalParagraphs[0] as TEntityType;
                }

                //Check if one of the top n matches is the one we want
                for (int j = 0; j < finalParagraphs.Count; j++)
                {
                    if (finalParagraphs[j].InternalId == sourceEntity.InternalId)
                    {
                        if (i == 0 && j == 0)
                        {
                            //Perfect round trip - short-circuit before any further calculation
                            return 0;
                        }

                        //The penalty is based on the search result positions; the higher the result, the lower the penalty
                        double penaltyFirst = CalculatePositionPenalty(i, translatedParagraphs.Count);
                        double penaltySecond = CalculatePositionPenalty(j, finalParagraphs.Count);

                        //Positional part first, then the CalculateTokenSimilarity terms for the first forward match and
                        //the first round-trip match (presumably larger when they differ more from the source - confirm)
                        return (penaltyFirst * 5 + penaltySecond) * m_failedFalseMatchPenalty / 2
                               + CalculateTokenSimilarity(sourceEntity, translatedParagraphs[0] as TEntityType)
                               + CalculateTokenSimilarity(sourceEntity, firstMatchFinal) / 2;
                    }
                }
            }

            //The original entity was not found after the round trip: use a fixed base plus the similarity terms.
            //NOTE(review): an earlier comment described the base as 4x the false match penalty, but the code uses 7x
            //(the positional part above tops out at 3x) - confirm which factor is intended.
            return 7 * m_failedFalseMatchPenalty
                   + CalculateTokenSimilarity(sourceEntity, translatedParagraphs[0] as TEntityType)
                   + CalculateTokenSimilarity(sourceEntity, firstMatchFinal) / 2;
        }

        /// <summary>
        /// Maps a zero-based result position to a penalty in [0, 1] on a logarithmic scale:
        /// log(position + 1) / log(resultCount).
        /// </summary>
        /// <param name="position">Zero-based index of the hit within its result list.</param>
        /// <param name="resultCount">Number of entries in that result list.</param>
        /// <returns>
        /// 0 for the top position, and also for lists with at most one entry, where the plain formula
        /// would evaluate 0 / 0 and yield NaN.
        /// </returns>
        private static double CalculatePositionPenalty(int position, int resultCount)
        {
            if (position <= 0 || resultCount <= 1)
            {
                return 0;
            }

            return Math.Log(position + 1) / Math.Log(resultCount);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Looks up the entity with id <paramref name="sourceId"/> in <paramref name="sourceLanguage"/> and returns
        /// the first search hit for it in <paramref name="targetLanguage"/>.
        /// </summary>
        /// <typeparam name="T">
        /// <c>Sentence</c> selects the sentence lookup and sentence index; every other type is handled through the
        /// paragraph lookup and paragraph index.
        /// </typeparam>
        /// <param name="sourceId">Id of the source sentence or paragraph.</param>
        /// <param name="sourceLanguage">Language in which the source entity is looked up.</param>
        /// <param name="targetLanguage">Language assigned to the search query.</param>
        /// <param name="searchParameters">
        /// Parameters used to obtain the searcher; when null or not set, the cached searcher for
        /// <c>defaultSearchParameters</c> is used instead.
        /// </param>
        /// <returns>The first result of the search, or null when the source entity does not exist or nothing matched.</returns>
        public T FindTranslation<T>(Guid sourceId, string sourceLanguage, string targetLanguage, ParametrizedSearchParameters searchParameters) where T : class, IMarcellEntity
        {
            //TODO: Update to sent parameters
            bool hasCustomParameters = searchParameters != null && searchParameters.IsSet();

            OptimizedParametrizedSearch provider;
            if (hasCustomParameters)
            {
                provider = GetSearcher(searchParameters);
            }
            else
            {
                provider = m_searcherCache[defaultSearchParameters];
            }

            ParametrizedSearchQuery query;

            if (typeof(T) != typeof(Sentence))
            {
                // Anything that is not a sentence is resolved as a paragraph.
                var paragraph = provider.GetParagraph(sourceLanguage, sourceId);
                if (paragraph == null)
                {
                    return null;
                }

                query = new ParametrizedSearchQuery();
                query.Language = targetLanguage;
                query.DocumentTokens = paragraph.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = paragraph.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SectionTokens = paragraph.SectionSimilarityData.ConsolidatedTokens.ToArray();
                query.SectionTopics = paragraph.SectionSimilarityData.ConsolidatedTopics.ToArray();
                query.ParagraphTokens = paragraph.ParagraphSimilarityData.ConsolidatedTokens.ToArray();
                query.ParagraphTopics = paragraph.ParagraphSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.ParagraphIndex;
            }
            else
            {
                var sentence = provider.GetSentence(sourceLanguage, sourceId);
                if (sentence == null)
                {
                    return null;
                }

                query = new ParametrizedSearchQuery();
                query.Language = targetLanguage;
                query.DocumentTokens = sentence.DocumentSimilarityData.ConsolidatedTokens.ToArray();
                query.DocumentTopics = sentence.DocumentSimilarityData.ConsolidatedTopics.ToArray();
                query.SectionTokens = sentence.SectionSimilarityData.ConsolidatedTokens.ToArray();
                query.SectionTopics = sentence.SectionSimilarityData.ConsolidatedTopics.ToArray();
                query.ParagraphTokens = sentence.ParagraphSimilarityData.ConsolidatedTokens.ToArray();
                query.ParagraphTopics = sentence.ParagraphSimilarityData.ConsolidatedTopics.ToArray();
                query.SentenceTokens = sentence.SentenceSimilarityData.ConsolidatedTokens.ToArray();
                query.SentenceTopics = sentence.SentenceSimilarityData.ConsolidatedTopics.ToArray();
                query.SearchIn = IndexObjectType.SentenceIndex;
            }

            var searchResult = provider.PerformSearch<T>(query, 1, 1);
            return searchResult.ResultList.FirstOrDefault();
        }