/// <summary>
/// Reads a file from the corpus repository and builds an in-memory index of its terms.
/// </summary>
/// <remarks>
/// The file content is reduced to tokens made of letters, digits and apostrophes.
/// The tokens of <paramref name="path"/> itself are appended after the content tokens
/// so that the file can also be found by its name; they receive the positions that
/// follow the last content token. Every token is lower-cased and stemmed; tokens that
/// share a stem are grouped under one <see cref="ITermDocument"/>, and every distinct
/// lower-cased word gets one <see cref="IDocumentReference"/> collecting its positions.
/// </remarks>
/// <param name="path">Path of the file, relative to the directory returned by the corpus.</param>
/// <returns>
/// A tuple whose first item is the number of tokens found in the file content
/// (path tokens are not counted) and whose second item is the list of term documents.
/// </returns>
public async Task<Tuple<long, List<ITermDocument>>> simulateIndexer(String path)
{
    var stemmer = new Stemmer();
    // NOTE(review): looks like a leftover debug trace; kept so console output is unchanged.
    Console.WriteLine("ree");

    var fullPath = Path.Combine(await _corpus.GetRepository(), path);

    // 'using' guarantees the reader is released even when reading throws.
    string content;
    using (var reader = new StreamReader(fullPath))
    {
        content = await reader.ReadToEndAsync();
    }

    // Regex.Split returns a single empty string for empty input; drop empty tokens so
    // that an empty (or punctuation-only) file has length 0 and no "" term is indexed.
    var list = Regex.Split(Regex.Replace(content, "[^a-zA-Z0-9']+", " ").Trim(), "\\s+")
        .Where(token => token.Length > 0)
        .ToList();

    // Document length: content tokens only.
    long length = list.Count;

    // Append the tokens of the path so they are indexed as well.
    list.AddRange(
        Regex.Split(Regex.Replace(path, "[^a-zA-Z0-9']", " ").Trim(), "\\s+")
            .Where(token => token.Length > 0));

    var termDict = new Dictionary<string, ITermDocument>();
    var docDict = new Dictionary<string, IDocumentReference>();

    for (int i = 0; i < list.Count; i++)
    {
        var word = list[i].ToLower();
        var rootWord = stemmer.StemWord(word);

        ITermDocument term;
        if (!termDict.TryGetValue(rootWord, out term))
        {
            term = new TermDocument(rootWord);
            termDict.Add(rootWord, term);
        }

        IDocumentReference docref;
        if (!docDict.TryGetValue(word, out docref))
        {
            // The document id is not known here, so the reference is created with "".
            docref = new DocumentReference("", word);
            docDict.Add(word, docref);
            term.addDoc(docref);
        }

        docref.addPos(i);
    }

    return Tuple.Create(length, termDict.Values.ToList());
}
/// <summary>
/// Splits a text into stemmed terms and records where each word occurs.
/// </summary>
/// <remarks>
/// The text is first passed through <c>strip</c>. Entries found in the stop-word
/// collection are skipped and reduce the reported length by one each, but they still
/// occupy their slot, so recorded positions are indexes into the stripped sequence.
/// Words are grouped by the stem of their lower-cased form; document references are
/// keyed by the word exactly as returned by <c>strip</c> and are created with an
/// empty first constructor argument.
/// </remarks>
/// <param name="text">The raw text to tokenize.</param>
/// <returns>
/// A tuple whose first item is the number of stripped entries that are not stop words
/// and whose second item is the list of term documents that were built.
/// </returns>
public Tuple<int, List<ITermDocument>> Tokenize(string text)
{
    var termsByRoot = new Dictionary<string, ITermDocument>();
    var refsByWord = new Dictionary<string, IDocumentReference>();
    var tokens = strip(text);

    // Starts at the full count and is decremented for every stop word encountered.
    var remaining = tokens.Length;

    for (int position = 0; position < tokens.Length; position++)
    {
        var token = tokens[position];

        if (_stopWords.Contains(token))
        {
            remaining -= 1;
            continue;
        }

        var root = _stemmer.StemWord(token.ToLower());

        // Reuse the term for this stem, or register a new one.
        ITermDocument entry;
        if (!termsByRoot.TryGetValue(root, out entry))
        {
            entry = new TermDocument(root);
            termsByRoot.Add(root, entry);
        }

        // Reuse the reference for this exact word, or create it and attach it to the term.
        IDocumentReference reference;
        if (!refsByWord.TryGetValue(token, out reference))
        {
            reference = new DocumentReference("", token);
            refsByWord.Add(token, reference);
            entry.addDoc(reference);
        }

        reference.addPos(position);
    }

    return Tuple.Create(remaining, termsByRoot.Values.ToList());
}
/// <summary>
/// Drops the "Root", "DocumentsTable" and "InvertedTable" collections and re-populates
/// them with a small hard-coded data set describing two documents.
/// </summary>
/// <returns>A task that completes once all inserts have finished.</returns>
public async Task Default()
{
    _database.DropCollection("Root");
    _database.DropCollection("DocumentsTable");
    _database.DropCollection("InvertedTable");

    // NOTE(review): the MongoDB driver's ObjectId(string) constructor presumably expects
    // a 24-character hex string; "1" / "2" may throw at runtime - confirm and replace
    // with valid ids if so. Left unchanged because the intended ids are not visible here.
    var firstDocument = new BsonDocument
    {
        { "_id", new ObjectId("1") },
        { "path", "1.txt" },
        { "length", 6L }
    };
    var secondDocument = new BsonDocument
    {
        { "_id", new ObjectId("2") },
        { "path", "2.txt" },
        { "length", 4L }
    };
    await _documentsTable.InsertOneAsync(firstDocument);
    await _documentsTable.InsertOneAsync(secondDocument);

    var rootDocument = new BsonDocument
    {
        { "_id", "1" },
        { "no_docs", 2 },
        { "repo", "C:/Users/kooldeji/Documents/repo" },
        { "crawled", 0L }
    };
    await _root.InsertOneAsync(rootDocument);

    var termdict = new Dictionary<string, TermDocument>();

    // Each pair is (stem, word as it appears in the document), in document order.
    var firstIndex = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>("life", "life"),
        new KeyValuePair<string, string>("is", "is"),
        new KeyValuePair<string, string>("learn", "learning"),
        new KeyValuePair<string, string>("with", "with"),
        new KeyValuePair<string, string>("learn", "learners")
    };
    var secondIndex = new List<KeyValuePair<string, string>>
    {
        new KeyValuePair<string, string>("learn", "learning"),
        new KeyValuePair<string, string>("is", "is"),
        new KeyValuePair<string, string>("very", "very"),
        new KeyValuePair<string, string>("good", "good"),
        new KeyValuePair<string, string>("infact", "infact"),
        new KeyValuePair<string, string>("learn", "learning"),
        new KeyValuePair<string, string>("is", "is"),
        new KeyValuePair<string, string>("awesome", "awesome")
    };

    // NOTE(review): the position counter is deliberately carried over from the first
    // document to the second, as in the original code - confirm whether positions
    // should restart for every document.
    var position = AddSeedPostings(firstIndex, "1", termdict, 0);
    // The second document's references now carry its id ("2"); previously they were
    // created with an empty id although the id variable had just been set to "2".
    AddSeedPostings(secondIndex, "2", termdict, position);

    await _invertedTable.InsertManyAsync(termdict.Values);
}

/// <summary>
/// Adds the postings of one seed document to <paramref name="termdict"/>.
/// </summary>
/// <param name="entries">Pairs of (stem, word) in document order.</param>
/// <param name="docId">Id stored in every document reference created for this document.</param>
/// <param name="termdict">Terms collected so far, keyed by stem; updated in place.</param>
/// <param name="position">Position counter value before the first entry.</param>
/// <returns>The position counter value after the last entry.</returns>
private static int AddSeedPostings(
    IEnumerable<KeyValuePair<string, string>> entries,
    string docId,
    Dictionary<string, TermDocument> termdict,
    int position)
{
    // References are per document, keyed by stem + word.
    var docdict = new Dictionary<string, DocumentReference>();

    foreach (var entry in entries)
    {
        // Skipped entries still consume a position.
        position += 1;

        // Stems of one or two characters are not indexed.
        if (entry.Key.Length <= 2)
        {
            continue;
        }

        TermDocument term;
        if (!termdict.TryGetValue(entry.Key, out term))
        {
            term = new TermDocument(entry.Key);
            termdict.Add(entry.Key, term);
        }

        var referenceKey = entry.Key + entry.Value;
        DocumentReference docref;
        if (!docdict.TryGetValue(referenceKey, out docref))
        {
            docref = new DocumentReference(docId, entry.Value);
            docdict.Add(referenceKey, docref);
            term.addDoc(docref);
        }

        docref.addPos(position);
    }

    return position;
}