// Picks the search engine to use for a query.
// The query is passed through Unicode2ASCII.Convert; when the converted text
// equals the original query, mySearchEngine2 is returned, otherwise mySearchEngine.
private SearchEngine ChooseApproriateSearchEngine(string query)
{
    string converted = Unicode2ASCII.Convert(query);

    // Conversion changed the text, so the query is not identical to its converted form.
    if (!query.Equals(converted))
    {
        return this.mySearchEngine;
    }

    return this.mySearchEngine2;
}
// Build the inverted index with single-pass in-memory indexing (SPIMI).
// Block files and the merged index are written under the "..//..//SPIMI//" folder;
// any file there whose path does not contain "index.txt", "index_map.txt" or
// "index_length.txt" is deleted first.
// Return value: the number of documents read from `path` (NOT the index itself).
public int Index()
{
    const string spimiFolder = "..//..//SPIMI//";
    const string blockPrefix = "SPIMIvi_";

    Console.WriteLine("Index Construction using Single-pass in-memory Indexing");

    // In-memory dictionary: term -> list of ints maintained by SPIMI_Invert.
    // The initial capacity is pre-sized; presumably an estimate of the vocabulary size - TODO confirm.
    Dictionary <string, List <int> > dict = new Dictionary <string, List <int> >(252143);

    // Running size counter updated by SPIMI_Invert (bytes of term/docId/frequency data added so far).
    int count = 0;

    // Number of blocks written to disk so far; also the id of the next block.
    int blockId = 0;

    // Delete files left over from a previous run, keeping the index files named below.
    foreach (string filePath in Directory.GetFiles(spimiFolder))
    {
        bool keep = filePath.Contains("index.txt")
                    || filePath.Contains("index_map.txt")
                    || filePath.Contains("index_length.txt");
        if (!keep)
        {
            File.Delete(filePath);
        }
    }

    // Doubles as the id of the current document and, after the loop, the document total.
    // NOTE(review): ids follow the order returned by Directory.GetFiles, which the
    // framework does not guarantee to be sorted - confirm readers use the same order.
    int docId = 0;

    // Read and process every document in the collection.
    foreach (string docPath in Directory.GetFiles(path))
    {
        using (StreamReader sr = new StreamReader(docPath))
        {
            // ReadToEnd never returns null, so no null check is needed.
            // Lower-case with the invariant culture so the index does not depend on the
            // machine's locale (e.g. Turkish 'I' -> dotless 'i' under ToLower()).
            string content = sr.ReadToEnd().ToLowerInvariant();
            content = Unicode2ASCII.Convert(content);
            SPIMI_Invert(content, docId, ref dict, ref count, blockPrefix, ref blockId);
            ++docId;
        }
    }

    // Flush whatever is still in memory as one final block.
    if (dict.Count > 0)
    {
        Console.WriteLine("Block " + blockId.ToString() + ": Finished!");
        List <string> sortedTerm = dict.Keys.ToList();
        sortedTerm.Sort();
        WriteBlockToDisk(sortedTerm, dict, blockPrefix + blockId.ToString());
        ++blockId;
    }

    // Merge all blocks into the final index.
    Console.WriteLine("Merging blocks...");
    MergeBlocks(blockPrefix, blockId, spimiFolder + "indexvi", docId);
    Console.WriteLine("Inverted index successfully constructed.");

    return docId;
}
// Returns the text of one document after passing it through Unicode2ASCII.Convert.
// docId is used as an index into the file listing of the docPath directory,
// so an id outside that listing makes the array access throw.
protected override string GetDocContent(int docId)
{
    string[] documentFiles = Directory.GetFiles(docPath);

    string rawText;
    using (StreamReader reader = new StreamReader(documentFiles[docId]))
    {
        rawText = reader.ReadToEnd();
    }

    return Unicode2ASCII.Convert(rawText);
}
// Load the stopword list from "<path>//Stopword//stopwords_vi.txt".
// Each line of the file is one entry; every entry is passed through
// Unicode2ASCII.Convert before being stored, and duplicates collapse in the set.
private HashSet <string> GetStopwords()
{
    HashSet <string> stopwordSet = new HashSet <string>();

    foreach (string line in File.ReadLines(path + "//Stopword//stopwords_vi.txt"))
    {
        stopwordSet.Add(Unicode2ASCII.Convert(line));
    }

    return stopwordSet;
}
// Turns a raw query into a term -> frequency map.
// The query is first passed through Unicode2ASCII.Convert, then tokenized with the
// pattern [a-zA-Z]+; each token is lower-cased and counted.
// Returns an empty dictionary when the tokenizer yields no matches.
protected override Dictionary <string, int> Preprocess(string query)
{
    query = Unicode2ASCII.Convert(query);

    // Tokenize the query into runs of ASCII letters.
    MatchCollection words = Tokenizer.TokenizeDoc(query, @"[a-zA-Z]+");

    // Count how many times each term occurs in the query.
    Dictionary <string, int> terms = new Dictionary <string, int>();
    foreach (var word in words)
    {
        // Invariant lower-casing keeps terms pure ASCII regardless of the current
        // culture (ToLower() under tr-TR would turn 'I' into dotless 'i').
        string term = word.ToString().ToLowerInvariant();

        // TryGetValue leaves `seen` at 0 for a new term, so one lookup plus one
        // write covers both the "first occurrence" and "repeat" cases.
        int seen;
        terms.TryGetValue(term, out seen);
        terms[term] = seen + 1;
    }

    return terms;
}