/// <summary>
/// Decides whether this score should be preferred over <paramref name="s1"/>.
/// </summary>
/// <param name="s1">Score to compare against; may be null.</param>
/// <returns>
/// True when <paramref name="s1"/> is null, when this score wins the first
/// criterion on which the two scores differ, or when they tie on every criterion.
/// </returns>
internal bool IsBetterThan(SplitSetScore s1)
{
    // Anything beats "no score yet".
    if (s1 == null)
    {
        return true;
    }

    // 1. More letters covered overall (word letters + token letters) wins.
    var ownCoveredLetters = _lettersInWordCount + _lettersInTokenCount;
    var otherCoveredLetters = s1._lettersInWordCount + s1._lettersInTokenCount;
    if (ownCoveredLetters != otherCoveredLetters)
    {
        return ownCoveredLetters > otherCoveredLetters;
    }

    // 2. More letters covered by words wins.
    if (_lettersInWordCount != s1._lettersInWordCount)
    {
        return _lettersInWordCount > s1._lettersInWordCount;
    }

    // 3. Fewer splits in total wins.
    if (_totalSplitsCount != s1._totalSplitsCount)
    {
        return _totalSplitsCount < s1._totalSplitsCount;
    }

    // 4. More identified splits wins.
    if (_identifiedSplitCount != s1._identifiedSplitCount)
    {
        return _identifiedSplitCount > s1._identifiedSplitCount;
    }

    // Complete tie: this score is still reported as the better one.
    return true;
}
/// <summary>
/// Performs split specific to this splitter
/// </summary>
/// <param name="identifier">
/// Identifier to split. It is lower-cased with the invariant culture before being scored.
/// </param>
/// <returns>
/// List of splits; empty when <paramref name="identifier"/> is null or when no
/// split set score is produced for it.
/// </returns>
private List<SplitWithIdentification> BestSuffixSplit(string identifier)
{
    // Nothing to split: return an empty result instead of dereferencing null below.
    if (identifier == null)
    {
        return new List<SplitWithIdentification>();
    }

    // find best split for whole word
    _term = identifier.ToLowerInvariant().ToCharArray();

    // One slot per character of the term; entries are indexed by start position.
    _splitSetScores = new SplitSetScore[_term.Length];
    SplitSetScore wholeWordSplitSetScore = GetBestSplitSetScore(0);

    // NOTE(review): GetBestSplitSetScore is not visible here. It is called elsewhere with
    // an index one past the end of the term and its result is allowed to be null, so it
    // presumably returns null when there is nothing left to score (e.g. an empty
    // identifier) - confirm. Guard so that case yields an empty list rather than a
    // NullReferenceException.
    if (wholeWordSplitSetScore == null)
    {
        return new List<SplitWithIdentification>();
    }

    return wholeWordSplitSetScore.GetSplitWithIdentifications(_term);
}
/// <summary>
/// Walks the chain of split set scores that starts at this instance and turns
/// every score in it into a <see cref="SplitWithIdentification"/>.
/// </summary>
/// <param name="term">Characters that the start and end indexes of each score refer to.</param>
/// <returns>One entry per score in the chain, in chain order (this score first).</returns>
internal List<SplitWithIdentification> GetSplitWithIdentifications(char[] term)
{
    List<SplitWithIdentification> splits = new List<SplitWithIdentification>();

    // Follow the next-score links until the end of the chain.
    for (SplitSetScore node = this; node != null; node = node._nextSplitSetScore)
    {
        // Both indexes are inclusive, hence the + 1.
        int segmentLength = node._endIndex - node._startIndex + 1;
        string segmentText = new string(term, node._startIndex, segmentLength);

        splits.Add(new SplitWithIdentification(segmentText, node._identification));
    }

    return splits;
}
/// <summary>
/// Links <paramref name="s1"/> as the score that follows this one and folds its
/// counters into this score's own counters.
/// </summary>
/// <param name="s1">
/// Score to chain after this one; may be null, in which case the link is set to
/// null and no counter changes.
/// </param>
private void AddScore(SplitSetScore s1)
{
    _nextSplitSetScore = s1;

    // The counts could be obtained by hopping from score to score, but they are
    // checked very often, so they are accumulated into the parent instead.
    // Start and end indexes are not merged; they are examined on the next score.
    if (s1 != null)
    {
        _totalSplitsCount += s1._totalSplitsCount;
        _identifiedSplitCount += s1._identifiedSplitCount;
        _lettersInTokenCount += s1._lettersInTokenCount;
        _lettersInWordCount += s1._lettersInWordCount;
    }
}
/// <summary>
/// Fills the score array for every start position from <paramref name="endIndex"/>
/// down to <paramref name="startIndex"/> and returns the entry stored for
/// <paramref name="startIndex"/>.
/// </summary>
/// <param name="startIndex">Start index of term to calculate split set score</param>
/// <param name="endIndex">End index of term to calculate split set score</param>
/// <returns>Split set score stored in the array at <paramref name="startIndex"/></returns>
private SplitSetScore PopulateBestSplitSetScore(int startIndex, int endIndex)
{
    // Positions are processed from the back of the range to the front.
    // NOTE(review): presumably so that GetBestSplitSetScore can reuse the entries already
    // stored for later positions - that method is not visible here, confirm.
    int segmentStart = endIndex;
    while (segmentStart >= startIndex)
    {
        List<SplitPositionWithIdentification> knownEnds =
            TokenDictionary.GetPossibleEndIndexesList(_term, segmentStart, endIndex);

        SplitSetScore best = null;

        // Try every possible end position for the segment that begins at segmentStart.
        for (int segmentEnd = segmentStart; segmentEnd <= endIndex; segmentEnd++)
        {
            // Use the dictionary's entry for this end position when there is one;
            // otherwise treat the segment as unidentified.
            SplitPositionWithIdentification endPosition =
                knownEnds.FirstOrDefault(x => x.Position == segmentEnd)
                ?? new SplitPositionWithIdentification(segmentEnd, SplitIdentification.Unidentified);

            // Score of whatever follows this segment.
            SplitSetScore remainder = GetBestSplitSetScore(segmentEnd + 1);
            SplitSetScore candidate = new SplitSetScore(endPosition, remainder, segmentStart);

            // IsBetterThan accepts null, so the first candidate is always taken.
            if (candidate.IsBetterThan(best))
            {
                best = candidate;
            }
        }

        _splitSetScores[segmentStart] = best;
        segmentStart--;
    }

    return _splitSetScores[startIndex];
}
/// <summary>
/// Creates the score of a single segment running from <paramref name="startIndex"/>
/// to the given split position (both inclusive) and chains the score of what
/// follows onto it.
/// </summary>
/// <param name="splitPositionWithIdentification">End position of the segment and how it was identified.</param>
/// <param name="nextSplitSetScore">Score chained after this segment; may be null.</param>
/// <param name="startIndex">Index at which the segment starts.</param>
internal SplitSetScore(SplitPositionWithIdentification splitPositionWithIdentification, SplitSetScore nextSplitSetScore, int startIndex)
{
    _startIndex = startIndex;
    _endIndex = splitPositionWithIdentification.Position;

    // This segment always counts as one split, identified or not.
    _totalSplitsCount = 1;

    if (SplitterUtility.IsNotUnidentified(splitPositionWithIdentification.SplitIdentification))
    {
        _identification = splitPositionWithIdentification.SplitIdentification;
        _identifiedSplitCount = 1;

        // Both indexes are inclusive, hence the + 1.
        int segmentLength = _endIndex - _startIndex + 1;

        // Identified segments count as word letters; every other identification
        // that is not unidentified counts as token letters.
        bool countsAsWord = _identification == SplitIdentification.Identified;
        if (countsAsWord)
        {
            _lettersInWordCount = segmentLength;
        }
        else
        {
            _lettersInTokenCount = segmentLength;
        }
    }

    // Must run last: it adds the chained score's counters to the values set above.
    AddScore(nextSplitSetScore);
}