/// <summary>
/// Adds the given examples to the text inverted index (<c>_invertedIndex</c>).
/// </summary>
/// <remarks>
/// For each example the text returned by <c>GetTextFromExample</c> is split into terms by
/// <c>GetTerms</c>. Every distinct term gets a <c>TermInfo</c> holding the example id, the
/// positions at which the term occurs, and the occurrence count divided by the Euclidean norm
/// of all occurrence counts in that example. While examples are processed,
/// <c>InvertedDocumentFrequency</c> is used as a counter of examples containing the term; the
/// final pass replaces it with <c>Math.Log(exampleCount / counter)</c>.
/// NOTE(review): the final pass visits every entry of <c>_invertedIndex</c>, including entries
/// that were present before this call, so this looks like it expects an empty index — confirm
/// against callers.
/// </remarks>
/// <param name="examples">Pairs of example id and the example to index.</param>
/// <exception cref="InvalidOperationException">
/// An example yields more than <see cref="ushort.MaxValue"/> terms.
/// </exception>
public static void CreateIndex(IEnumerable<KeyValuePair<Guid, Example>> examples)
{
    var ex = examples.ToList();

    foreach (var example in ex)
    {
        string lines = GetTextFromExample(example.Value);
        var terms = GetTerms(lines);

        // Memory optimisation. Store term indices as ushort (16bit)
        if (terms.Count > ushort.MaxValue)
        {
            throw new InvalidOperationException("Too many terms in this example: " + example.Value.Title);
        }

        // Map each distinct term to the positions at which it occurs in this example.
        var termDictExample = new Dictionary<string, List<ushort>>();
        for (ushort i = 0; i < terms.Count; i++)
        {
            var term = terms[i];

            List<ushort> positions;
            if (!termDictExample.TryGetValue(term, out positions))
            {
                positions = new List<ushort>();
                termDictExample[term] = positions;
            }

            positions.Add(i);
        }

        // Euclidean length of the per-term occurrence counts; used to normalise term frequencies.
        var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

        foreach (var termDict in termDictExample)
        {
            var term = termDict.Key;

            // ToArray already produces an exactly-sized copy of the positions.
            var ti = new TermInfo(example.Key, termDict.Value.ToArray(), (float)(termDict.Value.Count / norm));

            Posting posting;
            if (_invertedIndex.TryGetValue(term, out posting))
            {
                posting.TermInfos.Add(ti);
            }
            else
            {
                posting = new Posting(new List<TermInfo> { ti });
                _invertedIndex[term] = posting;
            }

            // Counts the examples containing the term until the final pass below.
            posting.InvertedDocumentFrequency += 1;
        }
    }

    // Turn the per-term example counts into log(N / count).
    _invertedIndex.ForEachDo(
        x => x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency));
}
/// <summary>
/// Replaces the contents of the text inverted index (<c>_invertedIndex</c>) with the entries
/// stored in the index file.
/// </summary>
/// <remarks>
/// The file path is the part of the executing assembly's location that precedes the first
/// <c>\bin</c>, followed by <c>InvertedIndexRelativePath</c>. If the location contains no
/// <c>\bin</c>, <c>IndexOf</c> returns -1 and <c>Substring</c> throws.
/// Each line is parsed as <c>term|guid:pos,pos,...;guid:pos,...|tf,tf,...|idf</c>: the second
/// field holds one <c>guid:positions</c> entry per example, the third field holds one term
/// frequency per entry in the same order, and the fourth field is the inverse document frequency.
/// Numbers are parsed with the invariant culture because the format uses <c>,</c> as a list
/// separator, so decimals in the file cannot be culture-specific.
/// </remarks>
public static void ReadIndexFromFile()
{
    var location = Assembly.GetExecutingAssembly().Location;
    var index = location.IndexOf(@"\bin", StringComparison.InvariantCulture);
    var filePath = location.Substring(0, index) + InvertedIndexRelativePath;
    string[] lines = File.ReadAllLines(filePath);

    // Fully qualified so the block does not depend on a using directive for System.Globalization.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    _invertedIndex.Clear();
    foreach (var line in lines)
    {
        var splittedLine = line.Split('|');
        string term = splittedLine[0];
        var postings = splittedLine[1].Split(';');
        var termFrequencies = splittedLine[2].Split(',');
        var invertedDocFrequency = double.Parse(splittedLine[3], invariant);

        var termInfos = new List<TermInfo>();
        for (int i = 0; i < postings.Length; i++)
        {
            // "guid:pos,pos,..." — the i-th term frequency belongs to the i-th posting.
            var post = postings[i].Split(':');
            var tf = double.Parse(termFrequencies[i], invariant);
            var termEntries = post[1].Split(',')
                .Select(entry => ushort.Parse(entry, invariant))
                .ToArray();

            termInfos.Add(new TermInfo(new Guid(post[0]), termEntries, (float)tf));
        }

        _invertedIndex[term] = new Posting(termInfos) { InvertedDocumentFrequency = invertedDocFrequency };
    }
}
/// <summary>
/// Adds the source code of the given examples to the code inverted index
/// (<c>_codeInvertedIndex</c>).
/// </summary>
/// <remarks>
/// For each example the text returned by <c>GetSourceCodeFromExample</c> is lower-cased, split
/// on spaces and passed through an <c>NGramTokenizer</c>; the flattened tokens longer than one
/// character are the terms. Every distinct term gets a <c>TermInfo</c> holding the example id,
/// the positions at which the term occurs, and the occurrence count divided by the Euclidean
/// norm of all occurrence counts in that example. While examples are processed,
/// <c>InvertedDocumentFrequency</c> is used as a counter of examples containing the term; the
/// final pass replaces it with <c>Math.Log(exampleCount / counter)</c> and re-creates each
/// posting's <c>TermInfos</c> list.
/// NOTE(review): the final pass visits every entry of <c>_codeInvertedIndex</c>, including
/// entries that were present before this call, so this looks like it expects an empty index —
/// confirm against callers.
/// </remarks>
/// <param name="examples">Pairs of example id and the example to index.</param>
/// <exception cref="InvalidOperationException">
/// An example yields more than <see cref="ushort.MaxValue"/> code terms.
/// </exception>
public static void CreateIndexForCode(IEnumerable<KeyValuePair<Guid, Example>> examples)
{
    var ex = examples.ToList();

    foreach (var example in ex)
    {
        var tokenizer = new NGramTokenizer();
        string lines = GetSourceCodeFromExample(example.Value);

        var terms = lines.ToLower().Split(' ').Where(x => x != "")
            .Select(tokenizer.Tokenize)
            .SelectMany(strings => strings.SelectMany(inner => inner))
            .Select(sb => sb.ToString())
            .Where(s => !string.IsNullOrEmpty(s) && s.Length > 1)
            .ToList();

        // Memory optimisation. Store term indices as ushort (16bit)
        if (terms.Count > ushort.MaxValue)
        {
            throw new InvalidOperationException("Too many code terms for example: " + example.Value.Title);
        }

        // Map each distinct term to the positions at which it occurs in this example.
        var termDictExample = new Dictionary<string, List<ushort>>();
        for (ushort i = 0; i < terms.Count; i++)
        {
            var term = terms[i];

            List<ushort> positions;
            if (!termDictExample.TryGetValue(term, out positions))
            {
                positions = new List<ushort>();
                termDictExample[term] = positions;
            }

            positions.Add(i);
        }

        // Euclidean length of the per-term occurrence counts; used to normalise term frequencies.
        var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

        foreach (var termDict in termDictExample)
        {
            var term = termDict.Key;
            var ti = new TermInfo(example.Key, termDict.Value.ToArray(), (float)(termDict.Value.Count / norm));

            Posting posting;
            if (_codeInvertedIndex.TryGetValue(term, out posting))
            {
                posting.TermInfos.Add(ti);
            }
            else
            {
                posting = new Posting(new List<TermInfo> { ti });
                _codeInvertedIndex[term] = posting;
            }

            // Counts the examples containing the term until the final pass below.
            posting.InvertedDocumentFrequency += 1;
        }
    }

    _codeInvertedIndex.ForEachDo(x =>
    {
        // Turn the per-term example count into log(N / count).
        x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency);

        // Collapse memory of List<TermInfo>
        x.Value.TermInfos = x.Value.TermInfos.ToList();
    });
}