/// <summary>
/// Secondary Split - Best Suffix
/// </summary>
/// <param name="identifier">Identifier to split</param>
/// <returns>
/// The splits produced for <paramref name="identifier"/>. Splits already identified by the
/// initial splitter are lower-cased and kept; the remaining ones go through the all-caps
/// re-split or <c>BestSuffixSplit</c>.
/// </returns>
protected override List<SplitWithIdentification> ApplySplit(string identifier)
{
    // STEP 1
    // Let the initial splitter (when one is configured) split and identify first.
    List<SplitWithIdentification> preSplits = InitialSplitter != null
        ? InitialSplitter.Split(identifier)
        : null;

    var results = new List<SplitWithIdentification>();

    // No initial result at all: the whole identifier goes straight to the best-suffix split.
    if (preSplits == null)
    {
        results.AddRange(BestSuffixSplit(identifier));
        return results;
    }

    // STEP 2
    // Refine every piece the initial splitter could not identify.
    foreach (SplitWithIdentification piece in preSplits)
    {
        if (SplitterUtility.IsNotUnidentified(piece.SplitIdentification))
        {
            // Already identified: normalise the casing in place and keep it.
            piece.Split = piece.Split.ToLowerInvariant();
            results.Add(piece);
            continue;
        }

        // Camel case just returns all caps. Only all-caps pieces that are long enough are
        // re-split; everything else is handed to the best-suffix split.
        bool isLongAllCaps = IsAllCaps(piece.Split)
            && piece.Split.Length >= IndexerResources.MinTokenLengthForCaps;
        if (!isLongAllCaps)
        {
            results.AddRange(BestSuffixSplit(piece.Split));
            continue;
        }

        // Recurse on the lower-cased text so the all-caps check cannot trigger again.
        List<SplitWithIdentification> loweredSplits = ApplySplit(piece.Split.ToLowerInvariant());
        bool anyUnidentified = loweredSplits.Any(
            inner => !SplitterUtility.IsNotUnidentified(inner.SplitIdentification));

        if (anyUnidentified)
        {
            // The re-split left something unidentified: keep the original piece untouched.
            results.Add(piece);
        }
        else
        {
            results.AddRange(loweredSplits);
        }
    }

    return results;
}
/// <summary>
/// Creates the score for a single split segment and then adds the score of the split set
/// that follows it.
/// </summary>
/// <param name="splitPositionWithIdentification">
/// Supplies the end position of the segment and its identification.
/// </param>
/// <param name="nextSplitSetScore">Score passed on to <c>AddScore</c>.</param>
/// <param name="startIndex">Start index of the segment.</param>
internal SplitSetScore(SplitPositionWithIdentification splitPositionWithIdentification, SplitSetScore nextSplitSetScore, int startIndex)
{
    _startIndex = startIndex;
    _endIndex = splitPositionWithIdentification.Position;

    var segmentIdentification = splitPositionWithIdentification.SplitIdentification;
    if (SplitterUtility.IsNotUnidentified(segmentIdentification))
    {
        _identification = segmentIdentification;
        _identifiedSplitCount = 1;

        // Both indexes are inclusive, hence the + 1.
        int segmentLength = _endIndex - _startIndex + 1;
        if (_identification != SplitIdentification.Identified)
        {
            _lettersInTokenCount = segmentLength;
        }
        else
        {
            _lettersInWordCount = segmentLength;
        }
    }

    // This segment counts as one split before the following set's score is added.
    _totalSplitsCount = 1;
    AddScore(nextSplitSetScore);
}
/// <summary>
/// Split implemented by particular splitter
/// </summary>
/// <param name="identifier">Identifier to split</param>
/// <returns>Identifier Split Result</returns>
public List<SplitWithIdentification> Split(string identifier)
{
    // Strip the non-word characters once and reuse the result for both the length check
    // and the returned split.
    string strippedIdentifier = _nonWordRegex.Replace(identifier, RegularExpressions.StringEmpty);
    if (strippedIdentifier.Length < IndexerResources.MinTokenLength)
    {
        // Too short to split. Note that the dictionary is asked about the original
        // (unstripped) identifier, while the returned split carries the stripped text.
        SplitIdentification shortIdentification = Dictionary.IsWord(identifier)
            ? SplitIdentification.Identified
            : SplitIdentification.SingleLetterIdentifier;

        return new List<SplitWithIdentification>
        {
            new SplitWithIdentification(strippedIdentifier, shortIdentification)
        };
    }

    identifier = _escapedCharacterRegex.Replace(identifier, RegularExpressions.Space);
    List<SplitWithIdentification> splitResult = ApplySplit(identifier);

    // Merge each unidentified split with a neighbour when the merged text is identified.
    // Improper casing might have caused an unintended split.
    // splitCount mirrors splitResult.Count and is adjusted by hand on every merge.
    int splitCount = splitResult.Count;
    for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
    {
        if (splitResult[splitCounter].SplitIdentification != SplitIdentification.Unidentified)
        {
            continue;
        }

        // Candidate: previous + current (null when there is no previous split).
        string mergeLeftText = splitCounter > 0
            ? splitResult[splitCounter - 1].Split.ToLowerInvariant() + splitResult[splitCounter].Split.ToLowerInvariant()
            : null;
        SplitIdentification mergeLeftIdentification = mergeLeftText == null
            ? SplitIdentification.Unidentified
            : GetSplitIdentification(mergeLeftText);

        // Candidate: current + next (null when there is no next split).
        string mergeRightText = splitCounter < splitCount - 1
            ? splitResult[splitCounter].Split.ToLowerInvariant() + splitResult[splitCounter + 1].Split.ToLowerInvariant()
            : null;
        SplitIdentification mergeRightIdentification = mergeRightText == null
            ? SplitIdentification.Unidentified
            : GetSplitIdentification(mergeRightText);

        if (SplitterUtility.IsNotUnidentified(mergeRightIdentification))
        {
            // The right merge wins when both candidates are identified.
            SplitWithIdentification newSplitWithIdentification = new SplitWithIdentification(mergeRightText, mergeRightIdentification);
            splitResult.RemoveAt(splitCounter + 1);
            splitResult.RemoveAt(splitCounter);
            splitResult.Insert(splitCounter, newSplitWithIdentification);
            splitCount--;
            continue;
        }
        else if (SplitterUtility.IsNotUnidentified(mergeLeftIdentification))
        {
            SplitWithIdentification newSplitWithIdentification = new SplitWithIdentification(mergeLeftText, mergeLeftIdentification);
            splitResult.RemoveAt(splitCounter);
            splitResult.RemoveAt(splitCounter - 1);
            splitResult.Insert(splitCounter - 1, newSplitWithIdentification);
            splitCount--;

            // The merged item now sits one slot earlier; step back so the loop increment
            // lands on the element that followed the merged pair.
            splitCounter--;
            continue;
        }

        // NOTE(review): the text built here is the same as mergeRightText, whose
        // identification was already found to be unidentified above. This branch can
        // therefore only merge if GetSplitIdentification answers differently for the same
        // input on a second call - confirm whether it is still needed.
        if (splitCounter < splitCount - 1 && splitResult[splitCounter + 1].SplitIdentification == SplitIdentification.Unidentified)
        {
            string unidentifiedMerged = splitResult[splitCounter].Split.ToLowerInvariant() + splitResult[splitCounter + 1].Split.ToLowerInvariant();
            SplitIdentification unidentifiedMergedIdentification = GetSplitIdentification(unidentifiedMerged);
            if (SplitterUtility.IsNotUnidentified(unidentifiedMergedIdentification))
            {
                SplitWithIdentification newSplitWithIdentification = new SplitWithIdentification(unidentifiedMerged, unidentifiedMergedIdentification);
                splitResult.RemoveAt(splitCounter + 1);
                splitResult.RemoveAt(splitCounter);
                splitResult.Insert(splitCounter, newSplitWithIdentification);
                splitCount--;
            }
        }
    }

    // Only run spell check when all split are done i.e. is primary splitter
    if (IsPrimarySplitter && TextCorrector != null && _resultPhase && identifier.Length >= IndexerResources.MinMisspelledStemmedLength)
    {
        List<SplitWithIdentification> splitResultWithCorrection = IdentifyMisspelled(splitResult);
        if (splitResultWithCorrection != null)
        {
            splitResult = splitResultWithCorrection;
        }
    }

    return splitResult;
}
/// <summary>
/// Updates dictionary with tokens
/// </summary>
/// <param name="unsplittedText">String to be splitted and update dictionary.</param>
public void UpdateTokenDictionary(string unsplittedText)
{
    // STEP 1
    // Split the identifier on camel case. The first letter of each split is extracted and
    // the resulting string is added as an abbreviation of the original text.
    // Only the split itself is used here; no word is identified in this step.
    List<SplitWithIdentification> camelCaseSplitResults = CamelCaseSplitIdentifications(unsplittedText);
    if (camelCaseSplitResults.Count >= IndexerResources.MinTokenLength)
    {
        string abbr = new string(camelCaseSplitResults.Select(part => part.Split.ElementAt(0)).ToArray());
        TokenDictionary.AddAbbreviation(abbr, unsplittedText);
    }

    // STEP 2
    // Split with initial splitter
    List<SplitWithIdentification> initialSplitterIdentifications;
    if (InitialSplitter == null)
    {
        // No initial splitter: treat the whole text as one unidentified split.
        initialSplitterIdentifications = new List<SplitWithIdentification>
        {
            new SplitWithIdentification(unsplittedText, SplitIdentification.Unidentified)
        };
    }
    else if (InitialSplitter is CamelCaseSplitter)
    {
        // Reuse the camel-case result from STEP 1 instead of splitting again.
        initialSplitterIdentifications = camelCaseSplitResults;
    }
    else
    {
        initialSplitterIdentifications = InitialSplitter.Split(unsplittedText);
    }

    // STEP 3
    // Split with own splitter.
    // splitCount mirrors initialSplitterIdentifications.Count and is adjusted by hand
    // whenever a split is replaced by its inner splits.
    int splitCount = initialSplitterIdentifications.Count;
    for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
    {
        SplitWithIdentification split = initialSplitterIdentifications[splitCounter];
        string lowerCaseSplit = split.Split.ToLowerInvariant();

        // if identified add to dictionary
        if (SplitterUtility.IsIdentified(split.SplitIdentification))
        {
            TokenDictionary.AddIdentifiedInProject(lowerCaseSplit);
            continue;
        }

        // anything else that is not unidentified needs no further work
        if (SplitterUtility.IsNotUnidentified(split.SplitIdentification))
        {
            continue;
        }

        // if the length meets minimum requirement, split it further
        if (SplitterUtility.CanBeToken(split.Split))
        {
            List<SplitWithIdentification> innerPrimarySplit = Split(lowerCaseSplit);

            // case 1: Its all caps
            if (IsAllCaps(split.Split))
            {
                // split it once to see if it was formed of some identified texts
                // this could be the case of natural token such as variable name written in all caps
                if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                {
                    foreach (SplitWithIdentification identified in innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified))
                    {
                        TokenDictionary.AddIdentifiedInProject(identified.Split);
                    }

                    // Replace the all-caps split by its inner splits.
                    initialSplitterIdentifications.RemoveAt(splitCounter);
                    initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                    splitCount += innerPrimarySplit.Count - 1;
                    continue;
                }

                // Not everything was identified: since it is all caps, add it as a token.
                // NOTE(review): this check repeats the enclosing CanBeToken(split.Split)
                // condition, so it looks always true here - kept as in the original.
                if (SplitterUtility.CanBeToken(split.Split))
                {
                    TokenDictionary.AddToken(lowerCaseSplit);
                    continue;
                }
            }

            // acronym
            if (IsAcronym(lowerCaseSplit))
            {
                TokenDictionary.AddToken(lowerCaseSplit);
                continue;
            }

            // Fully identified inner split: nothing more to do when it has no "Identified"
            // part at all, or when at least one such part reaches the minimum token length.
            if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
            {
                var identifieds = innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified).ToList();
                if (!identifieds.Any() || identifieds.Any(x => x.Split.Length >= IndexerResources.MinTokenLength))
                {
                    continue;
                }
            }

            if (Stemmer != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
            {
                // check if stemming gives any good result
                string stemmedText = Stemmer.GetStemmedText(lowerCaseSplit);
                if (stemmedText != null)
                {
                    SplitIdentification stemmedTextIdentification = GetSplitIdentification(stemmedText);
                    if (SplitterUtility.IsNotUnidentified(stemmedTextIdentification))
                    {
                        if (stemmedTextIdentification == SplitIdentification.Identified)
                        {
                            initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.WordStemmed;
                            TokenDictionary.AddStemmedWord(lowerCaseSplit, stemmedText);
                        }
                        else
                        {
                            initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.TokenStemmed;
                            TokenDictionary.AddStemmedToken(lowerCaseSplit, stemmedText);
                        }

                        continue;
                    }
                }
            }

            if (TextCorrector != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
            {
                // when everything fails we could save some computation for next time if the
                // words identified so far can recognize this as misspelled
                string correctedText = TextCorrector.Correct(lowerCaseSplit);

                // we WONT be considering misspelled if the split starts or ends with the
                // correction. Eg: awish is corrected to wish but it wont be handled.
                // Ordinal comparison: these are lower-cased identifier fragments, so
                // culture-specific matching rules must not influence the result.
                if (correctedText != null
                    && !(lowerCaseSplit.StartsWith(correctedText, System.StringComparison.Ordinal)
                         || lowerCaseSplit.EndsWith(correctedText, System.StringComparison.Ordinal)))
                {
                    SplitIdentification correctedIdentification = GetSplitIdentification(correctedText);
                    if (SplitterUtility.IsNotUnidentified(correctedIdentification))
                    {
                        if (correctedIdentification == SplitIdentification.Identified)
                        {
                            initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.WordMisspelled;
                            TokenDictionary.AddMisspelledWord(lowerCaseSplit, correctedText);
                        }
                        else
                        {
                            initialSplitterIdentifications[splitCounter].SplitIdentification = SplitIdentification.TokenMisspelled;
                            TokenDictionary.AddMisspelledToken(lowerCaseSplit, correctedText);
                        }
                    }

                    // NOTE(review): unlike the stemming branch, processing of this split
                    // stops here even when the correction itself is unidentified - kept
                    // as in the original; confirm this is intended.
                    continue;
                }
            }

            // if any split create an identified
            if (innerPrimarySplit.Any(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification) && SplitterUtility.CanBeToken(x.Split)))
            {
                initialSplitterIdentifications.RemoveAt(splitCounter);
                initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                splitCount += innerPrimarySplit.Count - 1;
                splitCounter--; // evaluate the current split too for oversplit below
                continue;
            }

            // so far no good. Add it as possible token since it can be a token
            _tokenCounterDictionary.Add(lowerCaseSplit);
        }

        // From here on the list has not been modified in this iteration, so
        // lowerCaseSplit is still the lower-cased text of the element at splitCounter.
        bool merged = false;

        // use merging to prevent oversplit
        // Merge with the next split when it is a token or identified and the previous one
        // (if any) is not a Token.
        if (splitCounter < splitCount - 1
            && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter + 1].SplitIdentification)
            && (splitCounter == 0 || initialSplitterIdentifications[splitCounter - 1].SplitIdentification != SplitIdentification.Token))
        {
            string identifiedText = initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant();
            string mergedText = lowerCaseSplit + identifiedText;
            if (SplitterUtility.CanBeToken(mergedText))
            {
                _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter + 1].SplitIdentification));
                merged = true;
            }
        }

        // Merge with the previous split when it is a token or identified and the next one
        // (if any) is not a Token.
        if (splitCounter > 0
            && SplitterUtility.IsTokenOrIdentified(initialSplitterIdentifications[splitCounter - 1].SplitIdentification)
            && (splitCounter == (splitCount - 1) || initialSplitterIdentifications[splitCounter + 1].SplitIdentification != SplitIdentification.Token))
        {
            string identifiedText = initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant();
            string mergedText = identifiedText + lowerCaseSplit;
            if (SplitterUtility.CanBeToken(mergedText))
            {
                _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, initialSplitterIdentifications[splitCounter - 1].SplitIdentification));
                merged = true;
            }
        }

        if (!merged)
        {
            // Nothing merged with an identified neighbour: count the concatenation with an
            // unidentified neighbour (next one preferred) as a possible token.
            if (splitCounter < splitCount - 1 && initialSplitterIdentifications[splitCounter + 1].SplitIdentification == SplitIdentification.Unidentified)
            {
                _tokenCounterDictionary.Add(lowerCaseSplit + initialSplitterIdentifications[splitCounter + 1].Split.ToLowerInvariant());
            }
            else if (splitCounter > 0 && initialSplitterIdentifications[splitCounter - 1].SplitIdentification == SplitIdentification.Unidentified)
            {
                _tokenCounterDictionary.Add(initialSplitterIdentifications[splitCounter - 1].Split.ToLowerInvariant() + lowerCaseSplit);
            }
        }
    }

    List<string> tokens = _tokenCounterDictionary.GetValidStringListAndRemove();

    // If a token splits entirely into identified parts there is nothing to add.
    // A token that splits into exactly two parts is added as a merged token keyed on
    // whichever part qualifies (second part checked first); any other split count adds
    // the token itself.
    tokens.ForEach(token =>
    {
        List<SplitWithIdentification> tokenSplits = Split(token);
        if (tokenSplits.All(part => SplitterUtility.IsNotUnidentified(part.SplitIdentification)))
        {
            return;
        }

        if (tokenSplits.Count == 2)
        {
            if (SplitterUtility.IsNotUnidentifiedAndNotMerged(tokenSplits[1].SplitIdentification))
            {
                TokenDictionary.AddMergedToken(token, tokenSplits[1]);
            }
            else if (SplitterUtility.IsNotUnidentifiedAndNotMerged(tokenSplits[0].SplitIdentification))
            {
                TokenDictionary.AddMergedToken(token, tokenSplits[0]);
            }
        }
        else
        {
            TokenDictionary.AddToken(token);
        }
    });

    List<KeyValuePair<string, SplitWithIdentification>> mergedTokens = _mergedTokenCounterDictionary.GetValidItemListAndRemove();
    mergedTokens.ForEach(x => TokenDictionary.AddMergedToken(x.Key, x.Value));
}