/// <summary>
/// Updates the token dictionary with the tokens found in the given text.
/// </summary>
/// <remarks>
/// The method works in four steps:
/// <list type="number">
/// <item><description>Camel-case split the text and register the first characters of the parts as an abbreviation.</description></item>
/// <item><description>Produce the initial splits with <c>InitialSplitter</c> (or use the whole text when no splitter is set).</description></item>
/// <item><description>Walk the splits, classifying each unidentified one (all caps, acronym, stemmed, misspelled, further splittable)
/// and recording candidate tokens / merged tokens in the counter dictionaries.</description></item>
/// <item><description>Flush the candidates that the counter dictionaries report as valid into <c>TokenDictionary</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="unsplittedText">The raw (not yet split) text that is split and used to update the dictionary.</param>
public void UpdateTokenDictionary(string unsplittedText)
{
    // STEP 1
    // Split the identifier on camel case. The first character of each part is collected and
    // registered as an abbreviation. Only the split is used here; no word is identified in this step.
    List<SplitWithIdentification> camelCaseSplitResults = CamelCaseSplitIdentifications(unsplittedText);
    if (camelCaseSplitResults.Count >= IndexerResources.MinTokenLength)
    {
        string abbreviation = new string(camelCaseSplitResults.Select(part => part.Split.ElementAt(0)).ToArray());
        TokenDictionary.AddAbbreviation(abbreviation, unsplittedText);
    }

    // STEP 2
    // Split with the initial splitter.
    List<SplitWithIdentification> initialSplitterIdentifications;
    if (InitialSplitter == null)
    {
        // No initial splitter: treat the whole text as one unidentified split.
        initialSplitterIdentifications = new List<SplitWithIdentification>
        {
            new SplitWithIdentification(unsplittedText, SplitIdentification.Unidentified)
        };
    }
    else if (InitialSplitter is CamelCaseSplitter)
    {
        // The camel-case result was already computed in step 1; reuse it.
        initialSplitterIdentifications = camelCaseSplitResults;
    }
    else
    {
        initialSplitterIdentifications = InitialSplitter.Split(unsplittedText);
    }

    // STEP 3
    // Split with our own splitter.
    // splitCount mirrors initialSplitterIdentifications.Count; it is adjusted by hand every time
    // a split is replaced by its inner splits further below.
    int splitCount = initialSplitterIdentifications.Count;
    for (int splitCounter = 0; splitCounter < splitCount; splitCounter++)
    {
        SplitWithIdentification split = initialSplitterIdentifications[splitCounter];
        string lowerCaseSplit = split.Split.ToLowerInvariant();

        // Identified: add to the dictionary and move on.
        if (SplitterUtility.IsIdentified(split.SplitIdentification))
        {
            TokenDictionary.AddIdentifiedInProject(lowerCaseSplit);
            continue;
        }

        // Anything else that is not unidentified needs no further work.
        if (SplitterUtility.IsNotUnidentified(split.SplitIdentification))
        {
            continue;
        }

        // If the split qualifies as a token, try to split it further.
        if (SplitterUtility.CanBeToken(split.Split))
        {
            List<SplitWithIdentification> innerPrimarySplit = Split(lowerCaseSplit);

            // Case 1: the split is all caps.
            if (IsAllCaps(split.Split))
            {
                // Split it once to see whether it is formed of already recognised texts.
                // This can be a natural token such as a variable name written in all caps.
                if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
                {
                    foreach (SplitWithIdentification inner in innerPrimarySplit.Where(x => x.SplitIdentification == SplitIdentification.Identified))
                    {
                        TokenDictionary.AddIdentifiedInProject(inner.Split);
                    }

                    // Replace the current split by its inner splits and keep splitCount in sync.
                    initialSplitterIdentifications.RemoveAt(splitCounter);
                    initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                    splitCount += innerPrimarySplit.Count - 1;
                    continue;
                }

                // Otherwise, since it is all caps, add it as a token.
                // NOTE(review): this condition repeats the enclosing CanBeToken(split.Split) check;
                // it is kept as-is to preserve behaviour.
                if (SplitterUtility.CanBeToken(split.Split))
                {
                    TokenDictionary.AddToken(lowerCaseSplit);
                    continue;
                }
            }

            // Acronym.
            if (IsAcronym(lowerCaseSplit))
            {
                TokenDictionary.AddToken(lowerCaseSplit);
                continue;
            }

            // Every inner split is recognised: nothing more to do when none of them is Identified,
            // or when at least one Identified part reaches the minimum token length.
            if (innerPrimarySplit.All(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification)))
            {
                List<SplitWithIdentification> identifieds = innerPrimarySplit
                    .Where(x => x.SplitIdentification == SplitIdentification.Identified)
                    .ToList();
                if (!identifieds.Any() || identifieds.Any(x => x.Split.Length >= IndexerResources.MinTokenLength))
                {
                    continue;
                }
            }

            if (Stemmer != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
            {
                // Check whether stemming gives a recognised result.
                string stemmedText = Stemmer.GetStemmedText(lowerCaseSplit);
                if (stemmedText != null)
                {
                    SplitIdentification stemmedTextIdentification = GetSplitIdentification(stemmedText);
                    if (SplitterUtility.IsNotUnidentified(stemmedTextIdentification))
                    {
                        if (stemmedTextIdentification == SplitIdentification.Identified)
                        {
                            split.SplitIdentification = SplitIdentification.WordStemmed;
                            TokenDictionary.AddStemmedWord(lowerCaseSplit, stemmedText);
                        }
                        else
                        {
                            split.SplitIdentification = SplitIdentification.TokenStemmed;
                            TokenDictionary.AddStemmedToken(lowerCaseSplit, stemmedText);
                        }

                        continue;
                    }
                }
            }

            if (TextCorrector != null && lowerCaseSplit.Length >= IndexerResources.MinMisspelledStemmedLength)
            {
                // When everything else fails, recording the split as misspelled saves computation next time,
                // provided the words identified so far can recognise the correction.
                string correctedText = TextCorrector.Correct(lowerCaseSplit);

                // A correction that is merely a prefix or suffix of the split is NOT treated as a misspelling.
                // E.g. "awish" corrected to "wish" is not handled here.
                // Ordinal comparison: the text is invariant-lowercased, so the check must not depend
                // on the current thread culture.
                if (correctedText != null
                    && !(lowerCaseSplit.StartsWith(correctedText, System.StringComparison.Ordinal)
                         || lowerCaseSplit.EndsWith(correctedText, System.StringComparison.Ordinal)))
                {
                    SplitIdentification correctedIdentification = GetSplitIdentification(correctedText);
                    if (SplitterUtility.IsNotUnidentified(correctedIdentification))
                    {
                        if (correctedIdentification == SplitIdentification.Identified)
                        {
                            split.SplitIdentification = SplitIdentification.WordMisspelled;
                            TokenDictionary.AddMisspelledWord(lowerCaseSplit, correctedText);
                        }
                        else
                        {
                            split.SplitIdentification = SplitIdentification.TokenMisspelled;
                            TokenDictionary.AddMisspelledToken(lowerCaseSplit, correctedText);
                        }
                    }

                    // Unlike the stemming branch, we move on even when the correction is not recognised.
                    continue;
                }
            }

            // If any inner split is recognised and can be a token, replace the current split by its inner splits.
            if (innerPrimarySplit.Any(x => SplitterUtility.IsNotUnidentified(x.SplitIdentification) && SplitterUtility.CanBeToken(x.Split)))
            {
                initialSplitterIdentifications.RemoveAt(splitCounter);
                initialSplitterIdentifications.InsertRange(splitCounter, innerPrimarySplit);
                splitCount += innerPrimarySplit.Count - 1;

                // Step back so the first inserted split is evaluated too (for the oversplit handling below).
                splitCounter--;
                continue;
            }

            // So far no good. Add it as a possible token since it can be a token.
            _tokenCounterDictionary.Add(lowerCaseSplit);
        }

        // Use merging to prevent oversplit. No list mutation happens on the paths that reach this point,
        // so 'split' is still the element at splitCounter and the neighbours can be looked up once.
        bool hasNext = splitCounter < splitCount - 1;
        bool hasPrevious = splitCounter > 0;
        SplitWithIdentification next = hasNext ? initialSplitterIdentifications[splitCounter + 1] : null;
        SplitWithIdentification previous = hasPrevious ? initialSplitterIdentifications[splitCounter - 1] : null;
        bool merged = false;

        // Merge with the following split when it is a token/identified and the preceding one is not a Token.
        if (hasNext
            && SplitterUtility.IsTokenOrIdentified(next.SplitIdentification)
            && (!hasPrevious || previous.SplitIdentification != SplitIdentification.Token))
        {
            string identifiedText = next.Split.ToLowerInvariant();
            string mergedText = lowerCaseSplit + identifiedText;
            if (SplitterUtility.CanBeToken(mergedText))
            {
                _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, next.SplitIdentification));
                merged = true;
            }
        }

        // Merge with the preceding split when it is a token/identified and the following one is not a Token.
        if (hasPrevious
            && SplitterUtility.IsTokenOrIdentified(previous.SplitIdentification)
            && (!hasNext || next.SplitIdentification != SplitIdentification.Token))
        {
            string identifiedText = previous.Split.ToLowerInvariant();
            string mergedText = identifiedText + lowerCaseSplit;
            if (SplitterUtility.CanBeToken(mergedText))
            {
                _mergedTokenCounterDictionary.Add(mergedText, new SplitWithIdentification(identifiedText, previous.SplitIdentification));
                merged = true;
            }
        }

        // Nothing merged: pair the split with an unidentified neighbour (the following one is preferred)
        // and count the concatenation as a possible token.
        if (!merged)
        {
            if (hasNext && next.SplitIdentification == SplitIdentification.Unidentified)
            {
                _tokenCounterDictionary.Add(lowerCaseSplit + next.Split.ToLowerInvariant());
            }
            else if (hasPrevious && previous.SplitIdentification == SplitIdentification.Unidentified)
            {
                _tokenCounterDictionary.Add(previous.Split.ToLowerInvariant() + lowerCaseSplit);
            }
        }
    }

    // STEP 4
    // Flush the candidates the counter dictionaries report as valid.
    List<string> tokens = _tokenCounterDictionary.GetValidStringListAndRemove();

    // If a token splits into exactly two parts, add it as a merged token keyed on the recognised part
    // (the second part is checked first, then the first). Tokens that split entirely into recognised
    // parts are skipped; everything else is added as a plain token.
    foreach (string token in tokens)
    {
        List<SplitWithIdentification> tokenSplits = Split(token);
        if (tokenSplits.All(part => SplitterUtility.IsNotUnidentified(part.SplitIdentification)))
        {
            continue;
        }

        if (tokenSplits.Count == 2)
        {
            if (SplitterUtility.IsNotUnidentifiedAndNotMerged(tokenSplits[1].SplitIdentification))
            {
                TokenDictionary.AddMergedToken(token, tokenSplits[1]);
            }
            else if (SplitterUtility.IsNotUnidentifiedAndNotMerged(tokenSplits[0].SplitIdentification))
            {
                TokenDictionary.AddMergedToken(token, tokenSplits[0]);
            }
        }
        else
        {
            TokenDictionary.AddToken(token);
        }
    }

    List<KeyValuePair<string, SplitWithIdentification>> mergedTokens = _mergedTokenCounterDictionary.GetValidItemListAndRemove();
    foreach (KeyValuePair<string, SplitWithIdentification> mergedToken in mergedTokens)
    {
        TokenDictionary.AddMergedToken(mergedToken.Key, mergedToken.Value);
    }
}