Пример #1
0
        /// <summary>
        /// Adds every supplied example to <c>_invertedIndex</c> and afterwards replaces each
        /// posting's document count with <c>Math.Log(exampleCount / documentCount)</c>.
        /// </summary>
        /// <remarks>
        /// For each example the text returned by <c>GetTextFromExample</c> is split into terms by
        /// <c>GetTerms</c>. Every distinct term gets one <c>TermInfo</c> holding the example key,
        /// the positions of the term inside the example and the term count divided by the
        /// example's norm (square root of the summed <c>Sqr(count)</c> over all of its terms).
        /// NOTE(review): the final pass visits every entry of <c>_invertedIndex</c>, so this
        /// presumably expects the index to be empty on entry - confirm against callers.
        /// </remarks>
        /// <param name="examples">Example key / example pairs; enumerated once and buffered.</param>
        /// <exception cref="InvalidOperationException">
        /// An example produced more than <c>ushort.MaxValue</c> terms, so its term positions
        /// cannot be stored as 16-bit values.
        /// </exception>
        public static void CreateIndex(IEnumerable <KeyValuePair <Guid, Example> > examples)
        {
            var ex = examples.ToList();

            foreach (var example in ex)
            {
                string lines = GetTextFromExample(example.Value);
                var    terms = GetTerms(lines);

                // Memory optimisation. Store term indices as ushort (16bit)
                if (terms.Count > ushort.MaxValue)
                {
                    throw new InvalidOperationException("Too many terms in this example: " + example.Value.Title);
                }

                // Maps each distinct term of this example to the positions where it occurs.
                var termDictExample = new Dictionary <string, List <ushort> >();
                for (ushort i = 0; i < terms.Count; i++)
                {
                    var term = terms[i];

                    List <ushort> positions;
                    if (!termDictExample.TryGetValue(term, out positions))
                    {
                        positions             = new List <ushort>();
                        termDictExample[term] = positions;
                    }
                    positions.Add(i);
                }

                var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

                foreach (var termDict in termDictExample)
                {
                    var term = termDict.Key;

                    // ToArray() already yields an exactly-sized copy, so the list itself
                    // does not need to be trimmed first.
                    var ti = new TermInfo(example.Key, termDict.Value.ToArray(), (float)(termDict.Value.Count / norm));

                    Posting posting;
                    if (_invertedIndex.TryGetValue(term, out posting))
                    {
                        posting.TermInfos.Add(ti);
                    }
                    else
                    {
                        posting = new Posting(new List <TermInfo> {
                            ti
                        });
                        _invertedIndex[term] = posting;
                    }

                    // Until the final pass below this field counts the examples containing the term.
                    posting.InvertedDocumentFrequency += 1;
                }
            }

            // Turn the per-term document counts into logarithmic inverse document frequencies.
            _invertedIndex.ForEachDo(
                x => x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency));
        }
Пример #2
0
        /// <summary>
        /// Replaces the contents of <c>_invertedIndex</c> with the postings stored in the index file.
        /// </summary>
        /// <remarks>
        /// The file path is the executing assembly's location truncated at the first
        /// <c>\bin</c>, followed by <c>InvertedIndexRelativePath</c>. Each line is parsed as
        /// <c>term|guid:pos,pos,...;guid:pos,...|tf,tf,...|idf</c>, where the n-th term frequency
        /// belongs to the n-th posting. Numbers are parsed with the invariant culture because
        /// the format itself uses <c>,</c> as a list separator.
        /// The file is read completely before the index is cleared, so a failure to read it
        /// leaves the current index untouched.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// The executing assembly's path does not contain <c>\bin</c>.
        /// </exception>
        public static void ReadIndexFromFile()
        {
            var location = Assembly.GetExecutingAssembly().Location;

            // A path fragment is not linguistic text, so compare ordinally.
            var index = location.IndexOf(@"\bin", StringComparison.Ordinal);
            if (index < 0)
            {
                throw new InvalidOperationException(
                          "Cannot locate the inverted index file: the assembly path does not contain '\\bin': " + location);
            }

            var filePath = location.Substring(0, index) + InvertedIndexRelativePath;

            string[] lines = File.ReadAllLines(filePath);

            // Fully qualified so that no additional using directive is required.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            _invertedIndex.Clear();
            foreach (var line in lines)
            {
                var splittedLine = line.Split('|');

                string term                 = splittedLine[0];
                var    postings             = splittedLine[1].Split(';');
                var    termFrequencies      = splittedLine[2].Split(',');
                var    invertedDocFrequency = double.Parse(splittedLine[3], invariant);

                var termInfos = new List <TermInfo>();

                for (int i = 0; i < postings.Length; i++)
                {
                    var posting = postings[i];
                    var tf      = double.Parse(termFrequencies[i], invariant);

                    // post[0] is the example id, post[1] the comma separated term positions.
                    var post        = posting.Split(':');
                    var termEntries = post[1].Split(',').Select(ushort.Parse).ToArray();

                    termInfos.Add(new TermInfo(new Guid(post[0]), termEntries, (float)tf));
                }

                _invertedIndex[term] = new Posting(termInfos)
                {
                    InvertedDocumentFrequency = invertedDocFrequency
                };
            }
        }
Пример #3
0
        /// <summary>
        /// Adds the source code of every supplied example to <c>_codeInvertedIndex</c> and
        /// afterwards replaces each posting's document count with
        /// <c>Math.Log(exampleCount / documentCount)</c>.
        /// </summary>
        /// <remarks>
        /// The text returned by <c>GetSourceCodeFromExample</c> is lower-cased, split on spaces
        /// and passed word by word through an <c>NGramTokenizer</c>; the flattened tokens that
        /// are longer than one character become the terms. Every distinct term gets one
        /// <c>TermInfo</c> holding the example key, the positions of the term and the term count
        /// divided by the example's norm (square root of the summed <c>Sqr(count)</c>).
        /// NOTE(review): the final pass visits every entry of <c>_codeInvertedIndex</c>, so this
        /// presumably expects the index to be empty on entry - confirm against callers.
        /// </remarks>
        /// <param name="examples">Example key / example pairs; enumerated once and buffered.</param>
        /// <exception cref="InvalidOperationException">
        /// An example produced more than <c>ushort.MaxValue</c> terms, so its term positions
        /// cannot be stored as 16-bit values.
        /// </exception>
        public static void CreateIndexForCode(IEnumerable <KeyValuePair <Guid, Example> > examples)
        {
            var ex = examples.ToList();

            foreach (var example in ex)
            {
                var tokenizer = new NGramTokenizer();

                string lines = GetSourceCodeFromExample(example.Value);
                var    terms = lines.ToLower().Split(' ').Where(x => x != "")
                               .Select(tokenizer.Tokenize)
                               .SelectMany(strings => strings.SelectMany(inner => inner))
                               .Select(sb => sb.ToString())
                               .Where(s => !string.IsNullOrEmpty(s) && s.Length > 1)
                               .ToList();

                // Memory optimisation. Store term indices as ushort (16bit)
                if (terms.Count > ushort.MaxValue)
                {
                    throw new InvalidOperationException("Too many code terms for example: " + example.Value.Title);
                }

                // Maps each distinct term of this example to the positions where it occurs.
                var termDictExample = new Dictionary <string, List <ushort> >();
                for (ushort i = 0; i < terms.Count; i++)
                {
                    var term = terms[i];

                    List <ushort> positions;
                    if (!termDictExample.TryGetValue(term, out positions))
                    {
                        positions             = new List <ushort>();
                        termDictExample[term] = positions;
                    }
                    positions.Add(i);
                }

                var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

                foreach (var termDict in termDictExample)
                {
                    var term = termDict.Key;
                    var ti   = new TermInfo(example.Key, termDict.Value.ToArray(), (float)(termDict.Value.Count / norm));

                    Posting posting;
                    if (_codeInvertedIndex.TryGetValue(term, out posting))
                    {
                        posting.TermInfos.Add(ti);
                    }
                    else
                    {
                        posting = new Posting(new List <TermInfo>
                        {
                            ti,
                        });
                        _codeInvertedIndex[term] = posting;
                    }

                    // Until the final pass below this field counts the examples containing the term.
                    posting.InvertedDocumentFrequency += 1;
                }
            }

            _codeInvertedIndex.ForEachDo(x =>
            {
                // Turn the per-term document count into a logarithmic inverse document frequency.
                x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency);

                // Collapse memory of List<TermInfo>
                x.Value.TermInfos = x.Value.TermInfos.ToList();
            });
        }