Пример #1
0
        /// <summary>
        /// Splits <paramref name="unsplittedText"/> and records what is learned from the splits:
        /// an abbreviation built from the camel-case parts, identified words, tokens,
        /// stemmed and misspelled forms, and merged-token candidates.
        /// </summary>
        /// <remarks>
        /// Results are written to <c>TokenDictionary</c>. Candidates that are not yet accepted are
        /// added to <c>_tokenCounterDictionary</c> and <c>_mergedTokenCounterDictionary</c>; at the end
        /// of the call whatever those counters report as valid is moved into <c>TokenDictionary</c>.
        /// </remarks>
        /// <param name="unsplittedText">Text to be split and used to update the dictionary.</param>
        public void UpdateTokenDictionary(string unsplittedText)
        {
            // STEP 1
            // Split the identifier on camel case. Only the split boundaries are used here;
            // no word is identified in this step.
            List <SplitWithIdentification> camelCaseSplitResults = CamelCaseSplitIdentifications(unsplittedText);

            // When there are at least MinTokenLength camel-case parts, the first character of each part
            // is concatenated and registered as an abbreviation of the whole text.
            if (camelCaseSplitResults.Count >= IndexerResources.MinTokenLength)
            {
                string abbr = string.Empty;
                camelCaseSplitResults.ForEach(split => abbr += split.Split.ElementAt(0));
                TokenDictionary.AddAbbreviation(abbr, unsplittedText);
            }

            // STEP 2
            // Split with the initial splitter
            List <SplitWithIdentification> initialSplitterIdentifications;

            if (InitialSplitter == null)
            {
                // No initial splitter configured: treat the whole text as one unidentified split.
                initialSplitterIdentifications = new List <SplitWithIdentification>()
                {
                    new SplitWithIdentification(unsplittedText, SplitIdentification.Unidentified)
                };
            }
            else
            {
                if (InitialSplitter is CamelCaseSplitter)
                {
                    // Reuse the step 1 result instead of splitting again. This is the same list
                    // instance, so the in-place edits made in step 3 also change camelCaseSplitResults.
                    initialSplitterIdentifications = camelCaseSplitResults;
                }
                else
                {
                    initialSplitterIdentifications = InitialSplitter.Split(unsplittedText);
                }
            }

            // STEP 3
            // Split with own splitter
            // splitCount is the loop bound; it is kept in sync by hand whenever the list is edited below.
            int splitCount = initialSplitterIdentifications.Count;

            for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
            {
                SplitWithIdentification split = initialSplitterIdentifications.ElementAt(splitCounter);
                string lowerCaseSplit         = split.Split.ToLowerInvariant();

                // if identified add to dictionary
                if (SplitterUtility.IsIdentified(split.SplitIdentification))
                {
                    TokenDictionary.AddIdentifiedInProject(lowerCaseSplit);
                    continue;
                }

                // any other classification that the helper reports as "not unidentified" needs no further work; go to next
                if (SplitterUtility.IsNotUnidentified(split.SplitIdentification))
                {
                    continue;
                }

                // if the split qualifies as a token candidate, split it further
                if (SplitterUtility.CanBeToken(split.Split))
                {
                    List <SplitWithIdentification> innerPrimarySplit = Split(lowerCaseSplit);

                    // case 1: Its all caps
                    if (IsAllCaps(split.Split))
                    {
                        // split it once to see if it was formed of some identified texts
                        // this could be the case of natural token such as variable name written in all caps
                        if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                        {
                            innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified).ToList().ForEach(x => TokenDictionary.AddIdentifiedInProject(x.Split));
                            // Replace the current split with its inner splits and grow the loop bound to match.
                            // The counter is not decremented here, so the loop resumes at the entry
                            // following the first inserted split.
                            initialSplitterIdentifications.RemoveAt(splitCounter);
                            initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                            splitCount += innerPrimarySplit.Count - 1;
                            continue;
                        }

                        // otherwise, since it is all caps, add the whole split as a token
                        // NOTE(review): this repeats the enclosing CanBeToken(split.Split) check, so unless that
                        // helper can answer differently on a second call, every all-caps split that reaches this
                        // point is added as a token and the code below is skipped for it - confirm this is intended.
                        if (SplitterUtility.CanBeToken(split.Split))
                        {
                            TokenDictionary.AddToken(lowerCaseSplit);
                            continue;
                        }
                    }

                    // acronym
                    if (IsAcronym(lowerCaseSplit))
                    {
                        TokenDictionary.AddToken(lowerCaseSplit);
                        continue;
                    }

                    // Every inner split is already classified: nothing more to learn from this split when
                    // none of them is Identified, or when at least one Identified part is MinTokenLength or longer.
                    if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                    {
                        var identifieds = innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified).ToList();
                        if (!identifieds.Any() || identifieds.Any(x => x.Split.Length >= IndexerResources.MinTokenLength))
                        {
                            continue;
                        }
                    }

                    if (Stemmer != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
                    {
                        // check if stemming gives any good result
                        string stemmedText = Stemmer.GetStemmedText(lowerCaseSplit);
                        if (stemmedText != null)
                        {
                            SplitIdentification stemmedTextIdentification = GetSplitIdentification(stemmedText);
                            if (SplitterUtility.IsNotUnidentified(stemmedTextIdentification))
                            {
                                // The stem is known: re-tag the split in the list and record the mapping,
                                // as a stemmed word when the stem is Identified, otherwise as a stemmed token.
                                if (stemmedTextIdentification == SplitIdentification.Identified)
                                {
                                    initialSplitterIdentifications.ElementAt(splitCounter).SplitIdentification = SplitIdentification.WordStemmed;
                                    TokenDictionary.AddStemmedWord(lowerCaseSplit, stemmedText);
                                }
                                else
                                {
                                    initialSplitterIdentifications.ElementAt(splitCounter).SplitIdentification = SplitIdentification.TokenStemmed;
                                    TokenDictionary.AddStemmedToken(lowerCaseSplit, stemmedText);
                                }
                                continue;
                            }
                        }
                    }

                    if (TextCorrector != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
                    {
                        // when everything fails we could save some computation for next time if the words identified so far can recognize this as misspelled
                        string correctedText = TextCorrector.Correct(lowerCaseSplit);
                        // we WONT be considering misspelled if the split starts or ends with the corrected term. Eg: awish is corrected to wish but it wont be handled.
                        // NOTE(review): StartsWith/EndsWith are called without a StringComparison argument - confirm the comparison kind is the intended one.
                        if (correctedText != null && !(lowerCaseSplit.StartsWith(correctedText) || lowerCaseSplit.EndsWith(correctedText)))
                        {
                            SplitIdentification correctedIdentification = GetSplitIdentification(correctedText);
                            if (SplitterUtility.IsNotUnidentified(correctedIdentification))
                            {
                                // The correction is known: re-tag the split and record the mapping,
                                // as a misspelled word when the correction is Identified, otherwise as a misspelled token.
                                if (correctedIdentification == SplitIdentification.Identified)
                                {
                                    initialSplitterIdentifications.ElementAt(splitCounter).SplitIdentification = SplitIdentification.WordMisspelled;
                                    TokenDictionary.AddMisspelledWord(lowerCaseSplit, correctedText);
                                }
                                else
                                {
                                    initialSplitterIdentifications.ElementAt(splitCounter).SplitIdentification = SplitIdentification.TokenMisspelled;
                                    TokenDictionary.AddMisspelledToken(lowerCaseSplit, correctedText);
                                }
                            }
                            // NOTE(review): this continue also runs when the corrected text is not recognised, unlike the
                            // stemming branch above where continue is inside the identification check - confirm this is intentional.
                            continue;
                        }
                    }

                    // if any inner split is classified and can itself be a token, expand the current split into its inner splits
                    if (innerPrimarySplit.Any(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification) && SplitterUtility.CanBeToken(x.Split)))
                    {
                        initialSplitterIdentifications.RemoveAt(splitCounter);
                        initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                        splitCount += innerPrimarySplit.Count - 1;
                        splitCounter--; // cancels the loop increment so the first inserted split is evaluated next, including the oversplit handling below
                        continue;
                    }

                    // so far no good. Add it as possible token since it can be a token
                    _tokenCounterDictionary.Add(lowerCaseSplit);
                }

                // Reached only by splits that none of the branches above skipped with continue.
                bool merged = false;
                // use merging to prevent oversplit
                // Forward merge: the next split is a token or identified, and the previous split (if any) is not a Token.
                if (splitCounter < splitCount - 1 && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter + 1].SplitIdentification) &&
                    (splitCounter == 0 || initialSplitterIdentifications[splitCounter - 1].SplitIdentification != SplitIdentification.Token))
                {
                    string identifiedText = initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant();
                    string mergedText     = initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant() + identifiedText;
                    if (SplitterUtility.CanBeToken(mergedText))
                    {
                        // Key is current + next; the value keeps the neighbour's text and its identification.
                        _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter + 1].SplitIdentification));
                        merged = true;
                    }
                }

                // Backward merge: the previous split is a token or identified, and the next split (if any) is not a Token.
                if (splitCounter > 0 && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter - 1].SplitIdentification) &&
                    (splitCounter == (splitCount - 1) || initialSplitterIdentifications[splitCounter + 1].SplitIdentification != SplitIdentification.Token))
                {
                    string identifiedText = initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant();
                    string mergedText     = identifiedText + initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant();
                    if (SplitterUtility.CanBeToken(mergedText))
                    {
                        // Key is previous + current; the value keeps the neighbour's text and its identification.
                        _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter - 1].SplitIdentification));
                        merged = true;
                    }
                }

                // No merge candidate was recorded: count the concatenation with an Unidentified neighbour
                // as a possible token instead. The next neighbour is tried first, then the previous one.
                if (!merged)
                {
                    if (splitCounter < splitCount - 1 && initialSplitterIdentifications[splitCounter + 1].SplitIdentification == SplitIdentification.Unidentified)
                    {
                        _tokenCounterDictionary.Add(initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant() + initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant());
                    }
                    else if (splitCounter > 0 && initialSplitterIdentifications[splitCounter - 1].SplitIdentification == SplitIdentification.Unidentified)
                    {
                        _tokenCounterDictionary.Add(initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant() + initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant());
                    }
                }
            }

            // Presumably returns the candidates the counter now considers valid and removes them from it -
            // TODO confirm against the counter dictionary implementation.
            List <string> tokens = _tokenCounterDictionary.GetValidStringListAndRemove();

            // Promote each valid candidate:
            //  - if every split of it is already classified, nothing is added;
            //  - if it splits into exactly two parts, add it as a merged token keyed on the second part when that
            //    part is classified and not merged, otherwise on the first part under the same condition
            //    (nothing is added when neither part qualifies);
            //  - for any other number of parts, add it as a plain token.
            tokens.ForEach(token =>
            {
                List <SplitWithIdentification> tokenSplits = Split(token);
                if (tokenSplits.All(split => SplitterUtility.IsNotUnidentified(split.SplitIdentification)))
                {
                    return;
                }

                if (tokenSplits.Count == 2)
                {
                    if (SplitterUtility.IsNotUnidentifiedAndNotMerged((tokenSplits[1].SplitIdentification)))
                    {
                        TokenDictionary.AddMergedToken(token, tokenSplits[1]);
                    }
                    else if (SplitterUtility.IsNotUnidentifiedAndNotMerged((tokenSplits[0].SplitIdentification)))
                    {
                        TokenDictionary.AddMergedToken(token, tokenSplits[0]);
                    }
                }
                else
                {
                    TokenDictionary.AddToken(token);
                }
            });

            // Same promotion for the merge candidates collected in the loop above; each entry is a merged text
            // paired with the neighbouring split it was built from.
            List <KeyValuePair <string, SplitWithIdentification> > mergedTokens = _mergedTokenCounterDictionary.GetValidItemListAndRemove();

            mergedTokens.ForEach(x => TokenDictionary.AddMergedToken(x.Key, x.Value));
        }