/// <summary>
/// Reads a file and builds an in-memory term index from its content and its path.
/// </summary>
/// <remarks>
/// The file content is reduced to runs of letters, digits and apostrophes. The tokens of
/// <paramref name="path"/> itself are appended after the content tokens and indexed as well,
/// but they are not counted in the returned document length.
/// </remarks>
/// <param name="path">
/// File path; it is combined (via <c>Path.Combine</c>) with the value returned by
/// <c>_corpus.GetRepository()</c> to locate the file.
/// </param>
/// <returns>
/// A tuple whose first item is the number of tokens found in the file content and whose
/// second item is the list of term documents built from the content and path tokens.
/// </returns>
public async Task<Tuple<long, List<ITermDocument>>> simulateIndexer(String path)
{
    var stemmer = new Stemmer();

    string content;
    // The using block closes the reader even when reading throws.
    using (var reader = new StreamReader(Path.Combine(await _corpus.GetRepository(), path)))
    {
        content = await reader.ReadToEndAsync();
    }

    // Regex.Split yields a single empty string for empty input; drop it so that an empty
    // document has length 0 and no empty term is stemmed or indexed.
    var list = Regex.Split(Regex.Replace(content, "[^a-zA-Z0-9']+", " ").Trim(), "\\s+")
                    .Where(token => token.Length > 0)
                    .ToList();

    // Document length: content tokens only, captured before the path tokens are appended.
    long length = list.Count;

    // Append the tokens of the path so they are indexed too.
    list.AddRange(
        Regex.Split(Regex.Replace(path, "[^a-zA-Z0-9']", " ").Trim(), "\\s+")
             .Where(token => token.Length > 0));

    var termDict = new Dictionary<string, ITermDocument>();
    var docDict = new Dictionary<string, IDocumentReference>();

    for (int i = 0; i < list.Count; i++)
    {
        var word = list[i].ToLower();
        var rootWord = stemmer.StemWord(word);

        // One term document per stemmed root.
        ITermDocument term;
        if (!termDict.TryGetValue(rootWord, out term))
        {
            term = new TermDocument(rootWord);
            termDict.Add(rootWord, term);
        }

        // One document reference per distinct lower-cased word; it is attached to the
        // term of its root only the first time the word is seen.
        IDocumentReference docref;
        if (!docDict.TryGetValue(word, out docref))
        {
            docref = new DocumentReference("", word);
            docDict.Add(word, docref);
            term.addDoc(docref);
        }

        // The position is the token's index in the combined content + path token list.
        docref.addPos(i);
    }

    return Tuple.Create(length, termDict.Values.ToList());
}
/// <summary>
/// Builds term documents from the tokens of <paramref name="text"/>, skipping stop words.
/// </summary>
/// <param name="text">The text to tokenize; it is passed to <c>strip</c> to obtain the tokens.</param>
/// <returns>
/// A tuple whose first item is the number of tokens returned by <c>strip</c> minus the
/// stop words that were skipped, and whose second item is the list of term documents
/// (one per stemmed root) built from the remaining tokens.
/// </returns>
public Tuple<int, List<ITermDocument>> Tokenize(string text)
{
    var termDict = new Dictionary<string, ITermDocument>();
    var docDict = new Dictionary<string, IDocumentReference>();
    var stripped_text = strip(text);

    // Document length: starts at the raw token count and is decremented per stop word.
    var length = stripped_text.Length;

    for (int i = 0; i < stripped_text.Length; i++)
    {
        var word = stripped_text[i];

        // Stop words are neither indexed nor counted in the document length.
        // NOTE(review): the stop-word check uses the token as returned by strip, before
        // lower-casing - confirm strip already normalizes case if that is intended.
        if (_stopWords.Contains(word))
        {
            length -= 1;
            continue;
        }

        var rootWord = _stemmer.StemWord(word.ToLower());

        // One term document per stemmed root; create it on first sight.
        ITermDocument term;
        if (!termDict.TryGetValue(rootWord, out term))
        {
            term = new TermDocument(rootWord);
            termDict.Add(rootWord, term);
        }

        // One document reference per distinct token; it is attached to the term of its
        // root only the first time the token is seen.
        IDocumentReference docref;
        if (!docDict.TryGetValue(word, out docref))
        {
            docref = new DocumentReference("", word);
            docDict.Add(word, docref);
            term.addDoc(docref);
        }

        // The position is the index in the stripped token array, so skipped stop words
        // still advance the positions of the words that follow them.
        docref.addPos(i);
    }

    return Tuple.Create(length, termDict.Values.ToList());
}