Example #1
0
        /// <summary>
        /// Turns free text into the list of n-gram terms used for lookup.
        /// </summary>
        /// <remarks>
        /// The text is lower-cased, every non-word character is replaced by a space,
        /// and the result is split on spaces. Empty pieces and entries found in
        /// <c>_stopWords</c> are dropped. Each remaining word is passed through
        /// <c>NGramTokenizer.Tokenize</c>, the nested result is flattened, each item is
        /// converted with <c>ToString()</c>, and only strings longer than one character
        /// are kept.
        /// </remarks>
        /// <param name="text">Text to break into terms. Null is not handled: the value is dereferenced immediately.</param>
        /// <returns>The surviving terms, in the order they were produced.</returns>
        public static IList<string> GetTerms(string text)
        {
            // Normalise case first, then blank out anything that is not a word character.
            var normalised = Regex.Replace(text.ToLower(), @"\W", " ");

            // Split on single spaces, discarding the empty pieces and any stop words.
            var candidates = normalised
                .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                .Where(candidate => !_stopWords.Contains(candidate))
                .ToArray();

            var tokenizer = new NGramTokenizer();

            // One tokenizer result per word; each result is a sequence of sequences.
            var tokenized = candidates.Select(candidate => tokenizer.Tokenize(candidate));

            // Flatten both nesting levels down to the individual grams.
            var grams = tokenized.SelectMany(gramSets => gramSets.SelectMany(gramSet => gramSet));

            // A length above one already rules out the empty string; null is excluded explicitly.
            var result = grams
                .Select(gram => gram.ToString())
                .Where(term => term != null && term.Length > 1)
                .ToList();

            return result;
        }
Example #2
0
        /// <summary>
        /// Adds the source code of every supplied example to <c>_codeInvertedIndex</c>.
        /// </summary>
        /// <remarks>
        /// For each example the source text is lower-cased, split on spaces and run through
        /// <c>NGramTokenizer.Tokenize</c>; terms of one character or less are discarded.
        /// The positions of each term are stored as <see cref="ushort"/> values, and one
        /// <c>TermInfo</c> per distinct term is appended to that term's posting with a weight
        /// of (occurrence count / norm).
        /// While examples are processed, <c>InvertedDocumentFrequency</c> is incremented once
        /// per example that contains the term. A final pass replaces it with
        /// <c>Math.Log(exampleCount / thatCount)</c>.
        /// NOTE(review): the final pass appears to visit every entry of
        /// <c>_codeInvertedIndex</c>, so calling this method a second time would take the
        /// logarithm of values that were already converted. Confirm it is only called once.
        /// </remarks>
        /// <param name="examples">Pairs of example id and example to index.</param>
        /// <exception cref="InvalidOperationException">
        /// An example yields more than <see cref="ushort.MaxValue"/> terms.
        /// </exception>
        public static void CreateIndexForCode(IEnumerable<KeyValuePair<Guid, Example>> examples)
        {
            // Materialise once: the list is iterated below and its Count feeds the final pass.
            var ex = examples.ToList();

            foreach (var example in ex)
            {
                var tokenizer = new NGramTokenizer();

                string lines = GetSourceCodeFromExample(example.Value);
                var terms = lines.ToLower().Split(' ').Where(x => x != "")
                            .Select(tokenizer.Tokenize)
                            .SelectMany(strings => strings.SelectMany(inner => inner))
                            .Select(sb => sb.ToString())
                            .Where(s => !string.IsNullOrEmpty(s) && s.Length > 1)
                            .ToList();

                // Memory optimisation. Store term indices as ushort (16bit)
                if (terms.Count > ushort.MaxValue)
                {
                    throw new InvalidOperationException("Too many code terms for example: " + example.Value.Title);
                }

                // Map each distinct term to the positions at which it occurs in this example.
                var termDictExample = new Dictionary<string, List<ushort>>();
                for (ushort i = 0; i < terms.Count; i++)
                {
                    var current = terms[i];

                    // Single lookup instead of ContainsKey followed by the indexer.
                    List<ushort> occurrences;
                    if (!termDictExample.TryGetValue(current, out occurrences))
                    {
                        occurrences = new List<ushort>();
                        termDictExample[current] = occurrences;
                    }

                    occurrences.Add(i);
                }

                // Square root of the summed Sqr(count) over all distinct terms
                // (Sqr is defined elsewhere; presumably it squares its argument).
                var norm = Math.Sqrt(termDictExample.Sum(termDict => Sqr(termDict.Value.Count)));

                foreach (var termDict in termDictExample)
                {
                    var term = termDict.Key;
                    var positions = termDict.Value;

                    // Built once; both branches below store the same entry.
                    var ti = new TermInfo(example.Key, positions.ToArray(), (float)(positions.Count / norm));

                    if (_codeInvertedIndex.ContainsKey(term))
                    {
                        _codeInvertedIndex[term].TermInfos.Add(ti);
                    }
                    else
                    {
                        _codeInvertedIndex[term] = new Posting(new List<TermInfo>
                        {
                            ti,
                        });
                    }

                    // Until the final pass this field counts the examples containing the term.
                    _codeInvertedIndex[term].InvertedDocumentFrequency += 1;
                }
            }

            _codeInvertedIndex.ForEachDo(x =>
            {
                // Replace the raw count with log(number of examples / count).
                x.Value.InvertedDocumentFrequency = Math.Log(ex.Count / x.Value.InvertedDocumentFrequency);

                // Collapse memory of List<TermInfo>
                x.Value.TermInfos = x.Value.TermInfos.ToList();
            });
        }