예제 #1
0
파일: Crawler.cs 프로젝트: Kooldeji/WebSpy
        /// <summary>
        /// Reads a document from the corpus repository, splits it into words, appends the
        /// words found in the document's path, and groups every word under its stemmed root.
        /// </summary>
        /// <param name="path">Document path, combined with the directory returned by <c>_corpus.GetRepository()</c>.</param>
        /// <returns>
        /// A tuple whose first item is the number of words in the file content (path words are
        /// not counted) and whose second item is the list of terms built from content and path words.
        /// </returns>
        public async Task <Tuple <long, List <ITermDocument> > > simulateIndexer(String path)
        {
            var stemmer = new Stemmer();

            // NOTE(review): looks like leftover debug output; kept so observable behavior is unchanged.
            Console.WriteLine("ree");

            // "using" guarantees the file handle is released even when reading throws.
            string content;
            using (var reader = new StreamReader(Path.Combine(await _corpus.GetRepository(), path)))
            {
                content = await reader.ReadToEndAsync();
            }

            var list = new List <string>(SplitIntoWords(content));

            //Document Length (captured before the path words are appended)
            long length = list.Count;

            //Adding Term's Path for indexing.
            list.AddRange(SplitIntoWords(path));

            var termDict = new Dictionary <string, ITermDocument>();
            var docDict  = new Dictionary <string, IDocumentReference>();

            for (int i = 0; i < list.Count; i++)
            {
                var word     = list[i].ToLower();
                var rootWord = stemmer.StemWord(word);

                ITermDocument term;
                if (!termDict.TryGetValue(rootWord, out term))
                {
                    term = new TermDocument(rootWord);
                    termDict.Add(rootWord, term);
                }

                // One reference per distinct (lower-cased) word; it is attached to its term
                // only when first created, later occurrences just record another position.
                IDocumentReference docref;
                if (!docDict.TryGetValue(word, out docref))
                {
                    docref = new DocumentReference("", word);
                    docDict.Add(word, docref);
                    term.addDoc(docref);
                }
                docref.addPos(i);
            }
            return(Tuple.Create(length, termDict.Values.ToList()));
        }

        /// <summary>
        /// Splits text into runs of ASCII letters, digits and apostrophes.
        /// Returns an empty array (rather than a single empty string) when the text has no such run.
        /// </summary>
        private static string[] SplitIntoWords(string text)
        {
            return Regex.Replace(text, "[^a-zA-Z0-9']+", " ")
                        .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        }
예제 #2
0
        /// <summary>
        /// Breaks the text into tokens via <c>strip</c>, skips every token contained in
        /// <c>_stopWords</c>, and groups the remaining tokens under their stemmed, lower-cased root.
        /// </summary>
        /// <param name="text">The raw text to tokenize.</param>
        /// <returns>
        /// A tuple whose first item is the token count minus the skipped stop words and whose
        /// second item is the list of terms that were built.
        /// </returns>
        public Tuple <int, List <ITermDocument> > Tokenize(string text)
        {
            var tokens      = strip(text);
            var termsByRoot = new Dictionary <string, ITermDocument>();
            var refsByWord  = new Dictionary <string, IDocumentReference>();

            // Starts at the full token count; reduced by one for each stop word met below.
            var remaining = tokens.Length;

            for (int position = 0; position < tokens.Length; position++)
            {
                var token = tokens[position];

                // Stop words are not indexed and do not count towards the length.
                if (_stopWords.Contains(token))
                {
                    remaining--;
                    continue;
                }

                var root = _stemmer.StemWord(token.ToLower());

                // Fetch the term for this root, creating it on first sight.
                ITermDocument termEntry;
                if (!termsByRoot.TryGetValue(root, out termEntry))
                {
                    termEntry = new TermDocument(root);
                    termsByRoot.Add(root, termEntry);
                }

                // References are keyed by the token exactly as it appeared; a new
                // reference is attached to the term only once, when it is created.
                IDocumentReference reference;
                if (!refsByWord.TryGetValue(token, out reference))
                {
                    reference = new DocumentReference("", token);
                    refsByWord.Add(token, reference);
                    termEntry.addDoc(reference);
                }

                // Position is the index in the token array, stop words included.
                reference.addPos(position);
            }

            return Tuple.Create(remaining, termsByRoot.Values.ToList());
        }
예제 #3
0
파일: Corpus.cs 프로젝트: Kooldeji/WebSpy
        /// <summary>
        /// Drops the "Root", "DocumentsTable" and "InvertedTable" collections and re-populates
        /// the database with a small hard-coded sample: two document records, one root record
        /// and the terms built from the two sample word lists.
        /// </summary>
        public async Task Default()
        {
            _database.DropCollection("Root");
            _database.DropCollection("DocumentsTable");
            _database.DropCollection("InvertedTable");

            // NOTE(review): ObjectId strings are normally 24 hex characters — confirm that
            // "1" / "2" are accepted by the MongoDB driver version in use.
            var doc = new BsonDocument
            {
                { "_id", new ObjectId("1") },
                { "path", "1.txt" },
                { "length", 6L }
            };
            var doc1 = new BsonDocument
            {
                { "_id", new ObjectId("2") },
                { "path", "2.txt" },
                { "length", 4L }
            };
            await _documentsTable.InsertOneAsync(doc);

            await _documentsTable.InsertOneAsync(doc1);

            doc = new BsonDocument
            {
                { "_id", "1" },
                { "no_docs", 2 },
                { "repo", "C:/Users/kooldeji/Documents/repo" },
                { "crawled", 0L }
            };
            await _root.InsertOneAsync(doc);

            var termdict = new Dictionary <string, TermDocument>();

            // Position counter; it keeps running across both sample documents.
            var c = 0;

            // Each pair is (stemmed root, word as written).
            var index = new List <KeyValuePair <string, string> >
            {
                new KeyValuePair <string, string>("life", "life"),
                new KeyValuePair <string, string>("is", "is"),
                new KeyValuePair <string, string>("learn", "learning"),
                new KeyValuePair <string, string>("with", "with"),
                new KeyValuePair <string, string>("learn", "learners")
            };
            c = IndexSampleDocument("1", index, termdict, c);

            index = new List <KeyValuePair <string, string> >
            {
                new KeyValuePair <string, string>("learn", "learning"),
                new KeyValuePair <string, string>("is", "is"),
                new KeyValuePair <string, string>("very", "very"),
                new KeyValuePair <string, string>("good", "good"),
                new KeyValuePair <string, string>("infact", "infact"),
                new KeyValuePair <string, string>("learn", "learning"),
                new KeyValuePair <string, string>("is", "is"),
                new KeyValuePair <string, string>("awesome", "awesome")
            };
            // The second document's references now carry its id ("2"); previously an empty
            // string was passed although the id had just been assigned.
            c = IndexSampleDocument("2", index, termdict, c);

            await _invertedTable.InsertManyAsync(termdict.Values);
        }

        /// <summary>
        /// Adds the (root, word) pairs of one sample document to <paramref name="termdict"/>.
        /// </summary>
        /// <param name="id">Document id stored in every reference created for this document.</param>
        /// <param name="index">The document's (stemmed root, word) pairs, in reading order.</param>
        /// <param name="termdict">Terms collected so far, keyed by root; extended in place.</param>
        /// <param name="position">Position counter value before the first pair of this document.</param>
        /// <returns>The position counter value after the last pair of this document.</returns>
        private static int IndexSampleDocument(
            string id,
            IEnumerable <KeyValuePair <string, string> > index,
            Dictionary <string, TermDocument> termdict,
            int position)
        {
            // References are tracked per document, so every call starts with a fresh map.
            var docdict = new Dictionary <string, DocumentReference>();

            foreach (var item in index)
            {
                // The counter advances for every pair, including the skipped ones.
                position += 1;

                // Roots of one or two characters are not indexed.
                if (item.Key.Length <= 2)
                {
                    continue;
                }

                TermDocument term;
                if (!termdict.TryGetValue(item.Key, out term))
                {
                    term = new TermDocument(item.Key);
                    termdict.Add(item.Key, term);
                }

                var refKey = item.Key + item.Value;
                DocumentReference docref;
                if (!docdict.TryGetValue(refKey, out docref))
                {
                    docref = new DocumentReference(id, item.Value);
                    // Registering the reference lets a repeated word reuse it instead of
                    // attaching a second reference to the same term.
                    docdict.Add(refKey, docref);
                    term.addDoc(docref);
                }
                docref.addPos(position);
            }

            return position;
        }