Example #1
0
        /// <summary>
        /// Reads the document at <paramref name="path"/> (combined with the directory returned by
        /// <c>_corpus.GetRepository()</c>), tokenizes it and builds the positional term index for it.
        /// </summary>
        /// <remarks>
        /// Tokens are maximal runs of ASCII letters, digits and apostrophes. The tokens of
        /// <paramref name="path"/> itself are appended after the content tokens so that the path
        /// is indexed too; they receive positions that continue after the content, but they are
        /// not counted in the returned document length.
        /// </remarks>
        /// <param name="path">Path of the document, combined with the corpus repository directory.</param>
        /// <returns>
        /// A tuple whose first item is the number of tokens found in the file content and whose
        /// second item is the list of terms, one per distinct stemmed word.
        /// </returns>
        public async Task <Tuple <long, List <ITermDocument> > > simulateIndexer(String path)
        {
            var stemmer = new Stemmer();

            // NOTE(review): looks like leftover debug output; kept so console output is unchanged.
            Console.WriteLine("ree");

            string content;
            // The using statement disposes the reader even when ReadToEnd throws, so the
            // file handle is never leaked.
            using (var reader = new StreamReader(Path.Combine(await _corpus.GetRepository(), path)))
            {
                content = reader.ReadToEnd();
            }

            // Regex.Split yields a single empty string when the input is empty, so empty
            // tokens are dropped; otherwise an empty file would report a length of 1 and
            // an empty term would be indexed.
            var list = Regex.Split(Regex.Replace(content, "[^a-zA-Z0-9']+", " ").Trim(), "\\s+")
                       .Where(token => token.Length > 0)
                       .ToList();

            // Document length: content tokens only, captured before the path tokens are appended.
            long length = list.Count;

            // Append the tokens of the path so the document can also be found by its path.
            list.AddRange(Regex.Split(Regex.Replace(path, "[^a-zA-Z0-9']", " ").Trim(), "\\s+")
                          .Where(token => token.Length > 0));

            var termDict = new Dictionary <string, ITermDocument>();
            var docDict  = new Dictionary <string, IDocumentReference>();

            for (int i = 0; i < list.Count; i++)
            {
                // Tokens are ASCII-only here; invariant lower-casing avoids culture-specific
                // mappings (e.g. 'I' -> dotless 'i' under Turkish cultures).
                var word     = list[i].ToLowerInvariant();
                var rootWord = stemmer.StemWord(word);

                // One term per stemmed word.
                ITermDocument term;
                if (!termDict.TryGetValue(rootWord, out term))
                {
                    term = new TermDocument(rootWord);
                    termDict.Add(rootWord, term);
                }

                // One document reference per distinct (lower-cased) word; it is attached to
                // the term only when it is first created.
                IDocumentReference docref;
                if (!docDict.TryGetValue(word, out docref))
                {
                    docref = new DocumentReference("", word);
                    docDict.Add(word, docref);
                    term.addDoc(docref);
                }

                docref.addPos(i);
            }

            return Tuple.Create(length, termDict.Values.ToList());
        }
Example #2
0
        /// <summary>
        /// Splits <paramref name="text"/> into words and builds the positional term index for it.
        /// </summary>
        /// <param name="text">Raw text to tokenize; it is passed through <c>strip</c> first.</param>
        /// <returns>
        /// A tuple whose first item is the number of words remaining after stop words are
        /// discarded and whose second item is the list of terms, one per distinct stemmed word.
        /// </returns>
        public Tuple <int, List <ITermDocument> > Tokenize(string text)
        {
            var termsByRoot = new Dictionary <string, ITermDocument>();
            var refsByWord  = new Dictionary <string, IDocumentReference>();
            var words       = strip(text);   // presumably an array of words - confirm against strip

            // Starts at the total word count; every stop word encountered lowers it by one.
            var remaining = words.Length;

            for (int position = 0; position < words.Length; position++)
            {
                var current = words[position];

                // Stop words are not indexed and do not count towards the document length.
                // The lookup uses the word exactly as it came out of strip (not lower-cased).
                if (_stopWords.Contains(current))
                {
                    remaining--;
                    continue;
                }

                var stem = _stemmer.StemWord(current.ToLower());

                // Fetch the term for this stem, creating and registering it on first sight.
                ITermDocument termEntry;
                if (!termsByRoot.TryGetValue(stem, out termEntry))
                {
                    termEntry = new TermDocument(stem);
                    termsByRoot.Add(stem, termEntry);
                }

                // Document references are keyed by the original word; a new reference is
                // attached to the term only at the moment it is created.
                IDocumentReference reference;
                if (!refsByWord.TryGetValue(current, out reference))
                {
                    reference = new DocumentReference("", current);
                    refsByWord.Add(current, reference);
                    termEntry.addDoc(reference);
                }

                // Positions are indexes into the stripped word sequence, stop words included.
                reference.addPos(position);
            }

            return Tuple.Create(remaining, termsByRoot.Values.ToList());
        }