Пример #1
0
 /// <summary>
 /// Picks which of the two search engines should handle a query.
 /// </summary>
 /// <param name="query">The raw query text.</param>
 /// <returns>
 /// <c>mySearchEngine2</c> when <c>Unicode2ASCII.Convert</c> leaves the query
 /// unchanged; otherwise <c>mySearchEngine</c>.
 /// </returns>
 private SearchEngine ChooseApproriateSearchEngine(string query)
 {
     // Run the query through the ASCII conversion once and compare the result
     // with the input to see whether the conversion altered anything.
     string converted = Unicode2ASCII.Convert(query);

     if (!query.Equals(converted))
     {
         return this.mySearchEngine;
     }

     return this.mySearchEngine2;
 }
        // Perform indexing using SPIMI (single-pass in-memory indexing).
        // All files generated during the indexing process are stored in the SPIMI folder.
        // Return value: the number of documents processed (NOT the index itself);
        // MergeBlocks is handed "..//..//SPIMI//indexvi" as the output path for the merged index.
        public int Index()
        {
            // Shared by the per-block file names and by the merge step.
            const string blockPrefix = "SPIMIvi_";
            const string spimiFolder = "..//..//SPIMI//";

            Console.WriteLine("Index Construction using Single-pass in-memory Indexing");

            // In-memory dictionary: term -> postings list.
            // NOTE(review): 252143 is an unexplained initial capacity, presumably sized
            // for the expected vocabulary -- confirm.
            Dictionary <string, List <int> > dict = new Dictionary <string, List <int> >(252143);
            // Size (in bytes) of the values (term, docId, frequency) added to the dictionary
            // and postings lists; maintained by SPIMI_Invert through the ref parameter.
            int count = 0;
            // Number of blocks written to disk so far.
            int blockId = 0;

            // Delete old files, keeping any whose path contains one of the three names below.
            foreach (string filePath in Directory.GetFiles(spimiFolder))
            {
                bool keep = filePath.Contains("index.txt")
                            || filePath.Contains("index_map.txt")
                            || filePath.Contains("index_length.txt");
                if (!keep)
                {
                    File.Delete(filePath);
                }
            }

            // Doubles as the id of the document being processed and, at the end,
            // as the total number of documents.
            // NOTE(review): ids follow the order returned by Directory.GetFiles, which the
            // .NET documentation does not guarantee; any code that maps a docId back to a
            // file must enumerate the folder in the same order -- confirm.
            int docId = 0;

            // Read and process all documents.
            foreach (string docPath in Directory.GetFiles(path))
            {
                using (StreamReader sr = new StreamReader(docPath))
                {
                    // ReadToEnd never returns null (an exhausted stream yields ""), so no
                    // null check is needed. Lower-casing is culture-invariant so that the
                    // generated terms do not depend on the locale of the indexing machine.
                    string content = sr.ReadToEnd().ToLowerInvariant();
                    content = Unicode2ASCII.Convert(content);
                    SPIMI_Invert(content, docId, ref dict, ref count, blockPrefix, ref blockId);
                }
                ++docId;
            }

            // Write the remaining postings lists as one last block.
            if (dict.Count > 0)
            {
                Console.WriteLine("Block " + blockId.ToString() + ": Finished!");
                List <string> sortedTerm = dict.Keys.ToList();
                sortedTerm.Sort();
                WriteBlockToDisk(sortedTerm, dict, blockPrefix + blockId.ToString());
                ++blockId;
            }

            // Merge all blocks.
            Console.WriteLine("Merging blocks...");
            MergeBlocks(blockPrefix, blockId, spimiFolder + "indexvi", docId);

            Console.WriteLine("Inverted index successfully constructed.");
            return(docId);
        }
Пример #3
0
        /// <summary>
        /// Reads the document with the given id and returns its text after
        /// passing it through <c>Unicode2ASCII.Convert</c>.
        /// </summary>
        /// <param name="docId">
        /// Zero-based position of the file in the array returned by
        /// <c>Directory.GetFiles(docPath)</c>.
        /// </param>
        /// <returns>The converted file content.</returns>
        protected override string GetDocContent(int docId)
        {
            // The folder is listed on every call and the id is used as an index
            // into that listing.
            string targetFile = Directory.GetFiles(docPath)[docId];

            string rawText;
            using (StreamReader reader = new StreamReader(targetFile))
            {
                rawText = reader.ReadToEnd();
            }

            return Unicode2ASCII.Convert(rawText);
        }
        // Load the stopword list from "<path>//Stopword//stopwords_vi.txt" (one entry per line).
        // Each line is passed through Unicode2ASCII.Convert before being added to the set.
        private HashSet <string> GetStopwords()
        {
            HashSet<string> stopwordSet = new HashSet<string>();

            foreach (string line in File.ReadLines(path + "//Stopword//stopwords_vi.txt"))
            {
                // Add ignores duplicates, matching construction of a set from an array.
                stopwordSet.Add(Unicode2ASCII.Convert(line));
            }

            return stopwordSet;
        }
Пример #5
0
        /// <summary>
        /// Turns a raw query into a bag of terms: the query is passed through
        /// <c>Unicode2ASCII.Convert</c>, tokenized with the pattern <c>[a-zA-Z]+</c>,
        /// and every token is lower-cased and counted.
        /// </summary>
        /// <param name="query">The raw query text.</param>
        /// <returns>A map from each lower-cased term to its number of occurrences in the query.</returns>
        protected override Dictionary <string, int> Preprocess(string query)
        {
            query = Unicode2ASCII.Convert(query);

            // Tokenize the query into runs of ASCII letters.
            MatchCollection words = Tokenizer.TokenizeDoc(query, @"[a-zA-Z]+");

            // Term -> frequency within the query.
            Dictionary <string, int> terms = new Dictionary <string, int>();

            foreach (Match word in words)
            {
                // Culture-invariant lower-casing: with the current-culture ToLower() a
                // Turkish locale would map "I" to a dotless "ı", producing a non-ASCII term.
                string term = word.Value.ToLowerInvariant();

                // Single lookup: TryGetValue leaves 'seen' at 0 when the term is new.
                int seen;
                terms.TryGetValue(term, out seen);
                terms[term] = seen + 1;
            }

            return(terms);
        }