/// <summary>
/// Converts free text into the list of terms produced by the n-gram tokenizer.
/// </summary>
/// <remarks>
/// The text is lower-cased, every non-word character is replaced by a space,
/// and the result is split on spaces. Empty pieces and stop words are dropped
/// before tokenizing. Only tokens whose string form is longer than one
/// character are returned.
/// </remarks>
/// <param name="text">The text to break into terms.</param>
/// <returns>The terms, in the order the tokenizer produced them.</returns>
public static IList<string> GetTerms(string text)
{
    var lowered = text.ToLower();
    var cleaned = Regex.Replace(lowered, @"\W", " ");

    // Phase 1: keep the non-empty words that are not stop words.
    var keptWords = new List<string>();
    foreach (var candidate in cleaned.Split(' '))
    {
        if (candidate == "")
        {
            continue;
        }

        if (_stopWords.Contains(candidate))
        {
            continue;
        }

        keptWords.Add(candidate);
    }

    // Phase 2: tokenize each word and flatten the nested token sequences.
    var tokenizer = new NGramTokenizer();
    var result = new List<string>();
    foreach (var word in keptWords)
    {
        foreach (var tokenGroup in tokenizer.Tokenize(word))
        {
            foreach (var token in tokenGroup)
            {
                var term = token.ToString();
                if (string.IsNullOrEmpty(term) || term.Length <= 1)
                {
                    continue;
                }

                result.Add(term);
            }
        }
    }

    return result;
}
/// <summary>
/// Adds the source code of every example to the code inverted index.
/// </summary>
/// <remarks>
/// For each example the source text is lower-cased, split on spaces and run
/// through the n-gram tokenizer. The positions of every term are recorded, and
/// each term receives a weight equal to its occurrence count divided by the
/// norm computed from all occurrence counts of that example. After all
/// examples are processed, each posting's InvertedDocumentFrequency (used as a
/// plain "number of examples containing the term" counter while indexing) is
/// replaced by log(exampleCount / counter).
/// NOTE(review): the final pass rewrites every posting in the index, so this
/// appears to assume a single call against an empty index - confirm with callers.
/// </remarks>
/// <param name="examples">Pairs of example id and example to index.</param>
/// <exception cref="InvalidOperationException">
/// An example produces more than <see cref="ushort.MaxValue"/> terms.
/// </exception>
public static void CreateIndexForCode(IEnumerable<KeyValuePair<Guid, Example>> examples)
{
    var ex = examples.ToList();

    foreach (var example in ex)
    {
        var tokenizer = new NGramTokenizer();
        string lines = GetSourceCodeFromExample(example.Value);

        var terms = lines.ToLower().Split(' ').Where(x => x != "")
            .Select(tokenizer.Tokenize)
            .SelectMany(strings => strings.SelectMany(inner => inner))
            .Select(sb => sb.ToString())
            .Where(s => !string.IsNullOrEmpty(s) && s.Length > 1)
            .ToList();

        // Memory optimisation. Store term indices as ushort (16bit)
        if (terms.Count > ushort.MaxValue)
        {
            throw new InvalidOperationException("Too many code terms for example: " + example.Value.Title);
        }

        // Map each term to the positions at which it occurs in this example.
        var termDictExample = new Dictionary<string, List<ushort>>();
        for (ushort i = 0; i < terms.Count; i++)
        {
            var term = terms[i];
            List<ushort> positions;
            if (!termDictExample.TryGetValue(term, out positions))
            {
                positions = new List<ushort>();
                termDictExample[term] = positions;
            }

            positions.Add(i);
        }

        var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

        foreach (var termDict in termDictExample)
        {
            var term = termDict.Key;
            var occurrences = termDict.Value;
            var ti = new TermInfo(example.Key, occurrences.ToArray(), (float)(occurrences.Count / norm));

            // Single lookup: reuse the existing posting or create a new one.
            Posting posting;
            if (_codeInvertedIndex.TryGetValue(term, out posting))
            {
                posting.TermInfos.Add(ti);
            }
            else
            {
                posting = new Posting(new List<TermInfo> { ti, });
                _codeInvertedIndex[term] = posting;
            }

            // Counts the examples containing the term; turned into the IDF below.
            posting.InvertedDocumentFrequency += 1;
        }
    }

    _codeInvertedIndex.ForEachDo(x =>
    {
        x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency);

        // Collapse memory of List<TermInfo>
        x.Value.TermInfos = x.Value.TermInfos.ToList();
    });
}