/// <summary>
/// Performs a single step of shingle production. The method signals that it must be
/// called again by returning the <c>_requestNextToken</c> instance instead of calling
/// itself recursively, as the complexity of a fairly small matrix would then easily
/// require a gigabyte sized stack per thread.
/// </summary>
/// <param name="reusableToken">Token whose term buffer is overwritten with the shingle text when a shingle is produced.</param>
/// <returns>
/// <c>null</c> if exhausted, the <c>_requestNextToken</c> instance if one more call is
/// required for an answer, the <paramref name="reusableToken"/> instance when a new
/// shingle was written to it, or the result of <c>Next()</c> when a single
/// prefix/suffix shingle is being ignored.
/// </returns>
private Token ProduceNextToken(Token reusableToken)
{
    if (_currentPermuationTokens == null)
    {
        // There are no current permutation tokens: create the permutation iterator
        // on demand and ask for the next tokens permutation, if there is one.
        if (_permutations == null)
        {
            _permutations = Matrix.PermutationIterator();
        }

        if (!_permutations.HasNext())
        {
            return null;
        }

        NextTokensPermutation();
        return _requestNextToken;
    }

    _currentShingleLength++;

    var shingleFits =
        _currentShingleLength + _currentPermutationTokensStartOffset <= _currentPermuationTokens.Count
        && _currentShingleLength <= MaximumShingleSize;

    if (!shingleFits)
    {
        // It is NOT possible to create one more shingle from the current start offset.
        if (_currentPermutationTokensStartOffset < _currentPermuationTokens.Count - 1)
        {
            // Reset the shingle size and move one step to the right in the current tokens permutation.
            _currentPermutationTokensStartOffset++;
            _currentShingleLength = MinimumShingleSize - 1;
            return _requestNextToken;
        }

        // todo does this ever occur?
        if (_permutations == null)
        {
            return null;
        }

        if (!_permutations.HasNext())
        {
            // Load more data (if available) to the matrix. The outcome of the read is
            // deliberately ignored; we just read it.
            if (_input != null)
            {
                ReadColumn();
            }

            // Get rid of resources: delete the first column in the matrix.
            var droppedColumn = Matrix.Columns[0];
            Matrix.Columns.RemoveAt(0);

            // Forget every seen shingle that includes any of the tokens from the deleted column.
            var droppedTokens = droppedColumn.Rows.SelectMany(row => row.Tokens).ToList();

            // TODO: Write a unit test to cover this and make sure this is a good port!
            // According to the original porting note (-thoward) the unit tests never hit
            // this block, and it was changed significantly from the Java version, which
            // looped over the seen shingles and removed each one containing a deleted
            // token; here RemoveWhere and LINQ are used instead.
            _shinglesSeen.RemoveWhere(
                seen => seen.Find(droppedTokens.Contains) != default(Token));

            // Exhausted.
            if (Matrix.Columns.Count < MinimumShingleSize)
            {
                return null;
            }

            // Create permutations of the matrix as it now looks.
            _permutations = Matrix.PermutationIterator();
        }

        NextTokensPermutation();
        return _requestNextToken;
    }

    // It is possible to create at least one more shingle of the current matrix permutation.
    if (IsIgnoringSinglePrefixOrSuffixShingle && _currentShingleLength == 1)
    {
        var column = _currentPermutationRows[_currentPermutationTokensStartOffset].Column;
        if (column.IsFirst || column.IsLast)
        {
            return Next();
        }
    }

    // Collect the tokens that make up the shingle and sum up the length of their terms.
    var shingle = new EquatableList<Token>();
    var termLength = 0;
    var end = _currentPermutationTokensStartOffset + _currentShingleLength;
    for (var position = _currentPermutationTokensStartOffset; position < end; position++)
    {
        var member = _currentPermuationTokens[position];
        termLength += member.TermLength();
        shingle.Add(member);
    }

    // One spacer goes between each pair of adjacent tokens.
    if (SpacerCharacter != null)
    {
        termLength += _currentShingleLength - 1;
    }

    // Only produce shingles that have not already been created.
    if (!_shinglesSeen.Add(shingle))
    {
        return _requestNextToken;
    }

    // Shingle token factory. The capacity is the expected length plus some slack.
    var text = new StringBuilder(termLength + 10);
    foreach (var part in shingle)
    {
        if (SpacerCharacter != null && text.Length > 0)
        {
            text.Append(SpacerCharacter);
        }

        text.Append(part.TermBuffer(), 0, part.TermLength());
    }

    reusableToken.SetTermBuffer(text.ToString());
    UpdateToken(
        reusableToken,
        shingle,
        _currentPermutationTokensStartOffset,
        _currentPermutationRows,
        _currentPermuationTokens);

    return reusableToken;
}
/// <summary>
/// Discards the permutation iterator and the set of shingles already seen, then
/// resets <c>_input</c> when one is present.
/// </summary>
public override void Reset()
{
    _permutations = null;
    _shinglesSeen.Clear();

    // ProduceNextToken treats a null _input as a valid state, so guard here as well
    // rather than failing with a NullReferenceException.
    if (_input != null)
    {
        _input.Reset();
    }
}