Ejemplo n.º 1
0
        /// <summary>
        /// Secondary Split - Best Suffix.
        /// Refines the splits produced by the initial splitter; when there is no initial splitter
        /// (or it returns null) the whole identifier is handed to the best suffix split.
        /// </summary>
        /// <param name="identifier">Identifier to split</param>
        /// <returns>The refined list of splits with their identifications</returns>
        protected override List <SplitWithIdentification> ApplySplit(string identifier)
        {
            List <SplitWithIdentification> refinedSplits = new List <SplitWithIdentification>();

            // STEP 1
            // run the initial splitter, if one is configured
            List <SplitWithIdentification> coarseSplits = InitialSplitter != null ? InitialSplitter.Split(identifier) : null;

            if (coarseSplits == null)
            {
                // nothing to refine: best suffix split the identifier as a whole
                refinedSplits.AddRange(BestSuffixSplit(identifier));
                return(refinedSplits);
            }

            // STEP 2
            // refine every piece the initial splitter could not identify
            foreach (SplitWithIdentification piece in coarseSplits)
            {
                // already identified pieces are kept, normalised to lower case (the piece itself is updated)
                if (SplitterUtility.IsNotUnidentified(piece.SplitIdentification))
                {
                    piece.Split = piece.Split.ToLowerInvariant();
                    refinedSplits.Add(piece);
                    continue;
                }

                bool isLongAllCaps = IsAllCaps(piece.Split) && piece.Split.Length >= IndexerResources.MinTokenLengthForCaps;
                if (!isLongAllCaps)
                {
                    refinedSplits.AddRange(BestSuffixSplit(piece.Split));
                    continue;
                }

                // Camel case returns all-caps text as one piece. Re-split its lower-cased form;
                // lower casing keeps the recursive call from taking this all-caps path again.
                List <SplitWithIdentification> loweredSplits = ApplySplit(piece.Split.ToLowerInvariant());
                bool everyPartIdentified = loweredSplits.All(part => SplitterUtility.IsNotUnidentified(part.SplitIdentification));

                if (everyPartIdentified)
                {
                    refinedSplits.AddRange(loweredSplits);
                }
                else
                {
                    // keep the original all-caps piece untouched when any part stays unidentified
                    refinedSplits.Add(piece);
                }
            }

            return(refinedSplits);
        }
Ejemplo n.º 2
0
            /// <summary>
            /// Creates the score for a split set whose first split runs from <paramref name="startIndex"/>
            /// to the position of <paramref name="splitPositionWithIdentification"/>, then passes
            /// <paramref name="nextSplitSetScore"/> to AddScore.
            /// </summary>
            /// <param name="splitPositionWithIdentification">Position and identification of the first split</param>
            /// <param name="nextSplitSetScore">Score handed to AddScore once this split has been counted</param>
            /// <param name="startIndex">Index at which the first split starts</param>
            internal SplitSetScore(SplitPositionWithIdentification splitPositionWithIdentification, SplitSetScore nextSplitSetScore, int startIndex)
            {
                // this split always counts as one split, identified or not
                _totalSplitsCount = 1;
                _startIndex       = startIndex;
                _endIndex         = splitPositionWithIdentification.Position;

                bool isRecognised = SplitterUtility.IsNotUnidentified(splitPositionWithIdentification.SplitIdentification);
                if (isRecognised)
                {
                    // both indexes are inclusive, hence the + 1
                    int spanLength = _endIndex - _startIndex + 1;

                    _identifiedSplitCount = 1;
                    _identification       = splitPositionWithIdentification.SplitIdentification;

                    // letters are credited to the word counter only for Identified, otherwise to the token counter
                    if (_identification != SplitIdentification.Identified)
                    {
                        _lettersInTokenCount = spanLength;
                    }
                    else
                    {
                        _lettersInWordCount = spanLength;
                    }
                }

                AddScore(nextSplitSetScore);
            }
Ejemplo n.º 3
0
        /// <summary>
        /// Split implemented by particular splitter
        /// </summary>
        /// <remarks>
        /// An identifier that, after removing the <c>_nonWordRegex</c> matches, is shorter than
        /// <c>IndexerResources.MinTokenLength</c> is returned as a single split. Otherwise the identifier is split
        /// with <c>ApplySplit</c>, every unidentified split is merged with a neighbour when the merged text is
        /// identified (right neighbour first, then left), and finally misspelling identification is attempted
        /// when this is the primary splitter.
        /// </remarks>
        /// <param name="identifier">Identifier to split</param>
        /// <returns>Identifier Split Result</returns>
        public List <SplitWithIdentification> Split(string identifier)
        {
            // Strip the non-word matches once; the result serves both the length check and the returned split.
            string strippedIdentifier = _nonWordRegex.Replace(identifier, RegularExpressions.StringEmpty);
            if (strippedIdentifier.Length < IndexerResources.MinTokenLength)
            {
                // The dictionary lookup uses the raw identifier, while the returned split carries the stripped text.
                SplitIdentification shortIdentification = Dictionary.IsWord(identifier)
                    ? SplitIdentification.Identified
                    : SplitIdentification.SingleLetterIdentifier;
                return(new List <SplitWithIdentification>()
                {
                    new SplitWithIdentification(strippedIdentifier, shortIdentification)
                });
            }
            identifier = _escapedCharacterRegex.Replace(identifier, RegularExpressions.Space);

            List <SplitWithIdentification> splitResult = ApplySplit(identifier);

            // Merge unidentified splits with a neighbour in a single left-to-right pass;
            // improper casing might have caused an unintended split.
            // splitCount tracks the shrinking list so the bounds checks below stay valid.
            int splitCount = splitResult.Count;

            for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
            {
                if (splitResult[splitCounter].SplitIdentification != SplitIdentification.Unidentified)
                {
                    continue;
                }

                string currentText = splitResult[splitCounter].Split.ToLowerInvariant();

                // Merging with the right neighbour has priority.
                string mergeRightText = splitCounter < splitCount - 1 ? currentText + splitResult[splitCounter + 1].Split.ToLowerInvariant() : null;
                SplitIdentification mergeRightIdentification = mergeRightText == null ? SplitIdentification.Unidentified : GetSplitIdentification(mergeRightText);

                if (SplitterUtility.IsNotUnidentified(mergeRightIdentification))
                {
                    // Replace the pair (current, next) with the merged split.
                    splitResult.RemoveAt(splitCounter + 1);
                    splitResult[splitCounter] = new SplitWithIdentification(mergeRightText, mergeRightIdentification);
                    splitCount--;
                    continue;
                }

                // The left candidate is only evaluated when the right merge did not succeed.
                string mergeLeftText = splitCounter > 0 ? splitResult[splitCounter - 1].Split.ToLowerInvariant() + currentText : null;
                SplitIdentification mergeLeftIdentification = mergeLeftText == null ? SplitIdentification.Unidentified : GetSplitIdentification(mergeLeftText);

                if (SplitterUtility.IsNotUnidentified(mergeLeftIdentification))
                {
                    // Replace the pair (previous, current) with the merged split.
                    splitResult.RemoveAt(splitCounter);
                    splitResult[splitCounter - 1] = new SplitWithIdentification(mergeLeftText, mergeLeftIdentification);
                    splitCount--;
                    // Step back so the loop increment lands on the element that slid into this index.
                    splitCounter--;
                }

                // No separate "merge with an unidentified right neighbour" step is needed: the right merge above
                // already tests the concatenation with the next split whatever its identification is.
                // NOTE(review): this relies on GetSplitIdentification returning the same result for the same text - confirm.
            }

            // Only run spell check when all split are done i.e. is primary splitter
            if (IsPrimarySplitter && TextCorrector != null && _resultPhase && identifier.Length >= IndexerResources.MinMisspelledStemmedLength)
            {
                List <SplitWithIdentification> splitResultWithCorrection = IdentifyMisspelled(splitResult);
                if (splitResultWithCorrection != null)
                {
                    splitResult = splitResultWithCorrection;
                }
            }

            return(splitResult);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Updates dictionary with tokens
        /// </summary>
        /// <remarks>
        /// The text is camel case split to register an abbreviation, split with the initial splitter, and every
        /// split that is still unidentified is then examined (re-split, acronym, stemming, spelling correction,
        /// merging with neighbours). Candidates are collected in <c>_tokenCounterDictionary</c> and
        /// <c>_mergedTokenCounterDictionary</c>; the entries those counters report as valid are finally added to
        /// <c>TokenDictionary</c>.
        /// </remarks>
        /// <param name="unsplittedText">String to be splitted and update dictionary.</param>
        public void UpdateTokenDictionary(string unsplittedText)
        {
            // STEP 1
            // This step splits the identifiers with camel case. The first letter of each split is extracted and added as acronyms.
            // We will be using split only. We don't identify any word in this process.
            List <SplitWithIdentification> camelCaseSplitResults = CamelCaseSplitIdentifications(unsplittedText);

            if (camelCaseSplitResults.Count >= IndexerResources.MinTokenLength)
            {
                string abbr = string.Empty;
                camelCaseSplitResults.ForEach(split => abbr += split.Split.ElementAt(0));
                TokenDictionary.AddAbbreviation(abbr, unsplittedText);
            }

            // STEP 2
            // Split with initial splitter
            List <SplitWithIdentification> initialSplitterIdentifications;

            if (InitialSplitter == null)
            {
                // no initial splitter: treat the whole text as one unidentified split
                initialSplitterIdentifications = new List <SplitWithIdentification>()
                {
                    new SplitWithIdentification(unsplittedText, SplitIdentification.Unidentified)
                };
            }
            else
            {
                if (InitialSplitter is CamelCaseSplitter)
                {
                    // reuse the camel case result from STEP 1 (same list instance, it is modified in place below)
                    initialSplitterIdentifications = camelCaseSplitResults;
                }
                else
                {
                    initialSplitterIdentifications = InitialSplitter.Split(unsplittedText);
                }
            }

            // STEP 3
            // Split with own splitter
            // splitCount is maintained by hand because the list grows when a split is replaced by its inner splits
            int splitCount = initialSplitterIdentifications.Count;

            for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
            {
                SplitWithIdentification split = initialSplitterIdentifications[splitCounter];
                string lowerCaseSplit         = split.Split.ToLowerInvariant();

                // if identified add to dictionary
                if (SplitterUtility.IsIdentified(split.SplitIdentification))
                {
                    TokenDictionary.AddIdentifiedInProject(lowerCaseSplit);
                    continue;
                }

                // any other identification that is not unidentified needs no further work
                if (SplitterUtility.IsNotUnidentified(split.SplitIdentification))
                {
                    continue;
                }

                // if the length meets minimum requirement, split it further
                if (SplitterUtility.CanBeToken(split.Split))
                {
                    List <SplitWithIdentification> innerPrimarySplit = Split(lowerCaseSplit);

                    // case 1: Its all caps
                    if (IsAllCaps(split.Split))
                    {
                        // split it once to see if it was formed of some identified texts
                        // this could be the case of natural token such as variable name written in all caps
                        if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                        {
                            innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified).ToList().ForEach(x => TokenDictionary.AddIdentifiedInProject(x.Split));
                            initialSplitterIdentifications.RemoveAt(splitCounter);
                            initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                            splitCount += innerPrimarySplit.Count - 1;
                            continue;
                        }

                        // if all identified in the split meets minimum requirement, treat this as normal string rather than Capped abbreviation
                        // if it does not since its all caps add to token
                        // NOTE(review): this repeats the enclosing CanBeToken(split.Split) check on the same value, so it
                        // looks always true here; the comment above suggests a different condition was intended - confirm.
                        if (SplitterUtility.CanBeToken(split.Split))
                        {
                            TokenDictionary.AddToken(lowerCaseSplit);
                            continue;
                        }
                    }

                    // acronym
                    if (IsAcronym(lowerCaseSplit))
                    {
                        TokenDictionary.AddToken(lowerCaseSplit);
                        continue;
                    }

                    // everything in the inner split is identified: done unless the only Identified parts are
                    // all shorter than MinTokenLength, in which case the checks below still run
                    if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                    {
                        var identifieds = innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified).ToList();
                        if (!identifieds.Any() || identifieds.Any(x => x.Split.Length >= IndexerResources.MinTokenLength))
                        {
                            continue;
                        }
                    }

                    if (Stemmer != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
                    {
                        // check if stemming gives any good result
                        string stemmedText = Stemmer.GetStemmedText(lowerCaseSplit);
                        if (stemmedText != null)
                        {
                            SplitIdentification stemmedTextIdentification = GetSplitIdentification(stemmedText);
                            if (SplitterUtility.IsNotUnidentified(stemmedTextIdentification))
                            {
                                if (stemmedTextIdentification == SplitIdentification.Identified)
                                {
                                    initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.WordStemmed;
                                    TokenDictionary.AddStemmedWord(lowerCaseSplit, stemmedText);
                                }
                                else
                                {
                                    initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.TokenStemmed;
                                    TokenDictionary.AddStemmedToken(lowerCaseSplit, stemmedText);
                                }
                                continue;
                            }
                        }
                    }

                    if (TextCorrector != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
                    {
                        // when everything fails we could save some computation for next time if the words identified so far can recognize this as misspelled
                        string correctedText = TextCorrector.Correct(lowerCaseSplit);
                        // we WONT be considering misspelled if the corrected term is started or ended with correction. Eg: awish is corrected to wish but it wont be handled.
                        // Ordinal comparison: these are identifier fragments, so culture-specific matching rules must not apply.
                        if (correctedText != null &&
                            !(lowerCaseSplit.StartsWith(correctedText, System.StringComparison.Ordinal) || lowerCaseSplit.EndsWith(correctedText, System.StringComparison.Ordinal)))
                        {
                            SplitIdentification correctedIdentification = GetSplitIdentification(correctedText);
                            if (SplitterUtility.IsNotUnidentified(correctedIdentification))
                            {
                                if (correctedIdentification == SplitIdentification.Identified)
                                {
                                    initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.WordMisspelled;
                                    TokenDictionary.AddMisspelledWord(lowerCaseSplit, correctedText);
                                }
                                else
                                {
                                    initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.TokenMisspelled;
                                    TokenDictionary.AddMisspelledToken(lowerCaseSplit, correctedText);
                                }
                            }
                            // NOTE(review): unlike the stemming branch above, this continue also runs when the corrected
                            // text is itself unidentified, so such a split skips the merging below - confirm this is intended.
                            continue;
                        }
                    }

                    // if any split create an identified
                    if (innerPrimarySplit.Any(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification) && SplitterUtility.CanBeToken(x.Split)))
                    {
                        initialSplitterIdentifications.RemoveAt(splitCounter);
                        initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                        splitCount += innerPrimarySplit.Count - 1;
                        splitCounter--; // evaluate the current split too for oversplit below
                        continue;
                    }

                    // so far no good. Add it as possible token since it can be a token
                    _tokenCounterDictionary.Add(lowerCaseSplit);
                }

                bool merged = false;
                // use merging to prevent oversplit
                // candidate 1: current + next, when next is token/identified and previous is not a Token
                if (splitCounter < splitCount - 1 && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter + 1].SplitIdentification) &&
                    (splitCounter == 0 || initialSplitterIdentifications[splitCounter - 1].SplitIdentification != SplitIdentification.Token))
                {
                    string identifiedText = initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant();
                    string mergedText     = initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant() + identifiedText;
                    if (SplitterUtility.CanBeToken(mergedText))
                    {
                        _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter + 1].SplitIdentification));
                        merged = true;
                    }
                }

                // candidate 2: previous + current, when previous is token/identified and next is not a Token
                if (splitCounter > 0 && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter - 1].SplitIdentification) &&
                    (splitCounter == (splitCount - 1) || initialSplitterIdentifications[splitCounter + 1].SplitIdentification != SplitIdentification.Token))
                {
                    string identifiedText = initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant();
                    string mergedText     = identifiedText + initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant();
                    if (SplitterUtility.CanBeToken(mergedText))
                    {
                        _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter - 1].SplitIdentification));
                        merged = true;
                    }
                }

                // no merge candidate recorded: count the concatenation with an unidentified neighbour instead (next first, else previous)
                if (!merged)
                {
                    if (splitCounter < splitCount - 1 && initialSplitterIdentifications[splitCounter + 1].SplitIdentification == SplitIdentification.Unidentified)
                    {
                        _tokenCounterDictionary.Add(initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant() + initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant());
                    }
                    else if (splitCounter > 0 && initialSplitterIdentifications[splitCounter - 1].SplitIdentification == SplitIdentification.Unidentified)
                    {
                        _tokenCounterDictionary.Add(initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant() + initialSplitterIdentifications[splitCounter].Split.ToLowerInvariant());
                    }
                }
            }

            List <string> tokens = _tokenCounterDictionary.GetValidStringListAndRemove();

            // if tokens can be splitted into word or token add them as merged token. We will only consider one starting with unidentified followed by identified
            tokens.ForEach(token =>
            {
                List <SplitWithIdentification> tokenSplits = Split(token);

                // fully identified by splitting: nothing to add
                if (tokenSplits.All(split => SplitterUtility.IsNotUnidentified(split.SplitIdentification)))
                {
                    return;
                }

                if (tokenSplits.Count == 2)
                {
                    // the second split is preferred as the identified part, the first is the fallback
                    if (SplitterUtility.IsNotUnidentifiedAndNotMerged((tokenSplits[1].SplitIdentification)))
                    {
                        TokenDictionary.AddMergedToken(token, tokenSplits[1]);
                    }
                    else if (SplitterUtility.IsNotUnidentifiedAndNotMerged((tokenSplits[0].SplitIdentification)))
                    {
                        TokenDictionary.AddMergedToken(token, tokenSplits[0]);
                    }
                }
                else
                {
                    TokenDictionary.AddToken(token);
                }
            });

            List <KeyValuePair <string, SplitWithIdentification> > mergedTokens = _mergedTokenCounterDictionary.GetValidItemListAndRemove();

            mergedTokens.ForEach(x => TokenDictionary.AddMergedToken(x.Key, x.Value));
        }