/// <summary>
/// Tokenizes <paramref name="text"/> and merges the resulting tokens into
/// <paramref name="target"/>. Does nothing when the text is null or empty.
/// </summary>
/// <param name="target">Set receiving the tokens; duplicates are ignored by the set.</param>
/// <param name="text">Plain text to tokenize; may be null or empty.</param>
private static void AddTermsFromPlainText(ISet<string> target, string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    target.UnionWith(TextTokenizer.Tokenize(text));
}
/// <summary>
/// Applies a bound text query to the filter: each token of the query text is
/// added to <c>ExcludedTerms</c> when the query is negated, otherwise to
/// <c>IncludedTerms</c>.
/// </summary>
/// <param name="result">Filter being populated.</param>
/// <param name="expression">Query whose text is tokenized into terms.</param>
private static void Apply(IssueFilter result, BoundTextQuery expression)
{
    foreach (var term in TextTokenizer.Tokenize(expression.Text))
    {
        var bucket = expression.IsNegated ? result.ExcludedTerms : result.IncludedTerms;
        bucket.Add(term);
    }
}
/// <summary>
/// Reads <paramref name="fileName"/> and lazily yields its tokens, lower-cased
/// when <paramref name="ignoreCase"/> is true. If the file cannot be read the
/// error is logged and the sequence is empty (best-effort, mirrors the original
/// IOException handling).
/// </summary>
/// <param name="fileName">Path of the file to tokenize.</param>
/// <param name="ignoreCase">When true, tokens are normalized to lower case.</param>
/// <returns>The file's tokens, or an empty sequence on read failure.</returns>
private static IEnumerable<string> GetFileTokens(string fileName, bool ignoreCase)
{
    string text = null;
    try
    {
        text = File.ReadAllText(fileName);
    }
    catch (IOException ex)
    {
        Debug.WriteLine(ex.Message + ex.StackTrace);
    }
    catch (UnauthorizedAccessException ex)
    {
        // File.ReadAllText also throws this for permission problems; skip the
        // file like any other read failure instead of crashing enumeration.
        Debug.WriteLine(ex.Message + ex.StackTrace);
    }

    if (text == null)
    {
        yield break;
    }

    var tokenizer = new TextTokenizer(text);
    foreach (var token in tokenizer.Tokenize())
    {
        // ToLowerInvariant: token folding must not depend on the current
        // culture (ToLower() mis-folds 'I' under e.g. the Turkish culture,
        // which would make identical tokens compare unequal).
        yield return ignoreCase ? token.ToLowerInvariant() : token;
    }
}
/// <summary>
/// Performs clustering for the selected language using the current parameters.
/// Rebuilds <c>TernaryIndex</c> from scratch, feeds every document in
/// <c>DocFiles</c> through it in parallel, then seeds the similarity index
/// with the base words. Any unexpected failure is shown to the user.
/// </summary>
private void Cluster()
{
    try
    {
        // Fresh index per run, sized by the current cluster parameters.
        TernaryIndex = new TernaryIndex(ClusterVectorSize, ClusterVectorDensity);

        // NOTE(review): all parallel iterations write into the shared
        // TernaryIndex via ReadSequence — assumes ReadSequence is
        // thread-safe; confirm before trusting the results.
        Parallel.ForEach(DocFiles, docFile =>
        {
            string text;
            try
            {
                text = File.ReadAllText(Path.Combine(CommonFiles.DocsPath(LanguageId), docFile));
            }
            catch (IOException ex)
            {
                // Best-effort: log and skip unreadable files, keep
                // clustering the remaining documents.
                Debug.WriteLine(ex.Message + ex.StackTrace);
                return;
            }
            var tokenizer = new TextTokenizer(text);
            var tokens = tokenizer.Tokenize();
            // Fold this document's token sequence into the index using the
            // configured pre/post context windows.
            TernaryIndex.ReadSequence(tokens, ClusterPreWindow, ClusterPostWindow);
        });
        // Seed the similarity index with the base words gathered above.
        TernaryIndex.SimilarityIndex.AddRange(TernaryIndex.BaseWords.ToArray());
        //TernaryIndex.SimilarityIndex.BuildIndex(TernaryIndex.Contexts.Cast<ITernaryVector>().ToArray());
    }
    catch (Exception ex)
    {
        // Surfaced directly to the user — this appears to run from a UI
        // context (MessageBox), so we report rather than rethrow.
        MessageBox.Show(ex.Message + ex.StackTrace);
    }
}
/// <summary>
/// Splits <paramref name="text"/> into tokens using a <see cref="TextTokenizer"/>
/// backed by a fresh <see cref="TextHelper"/>.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>The tokens as a materialized list.</returns>
private static List<IToken> Tokenize(string text)
{
    ITextTokenizer tokenizer = new TextTokenizer(new TextHelper());
    var tokens = tokenizer.Tokenize(text);
    return tokens.ToList();
}