/// <summary>
        /// Tokenizes the query, scores every document against the query terms found in
        /// <c>dict</c>, and returns a text report of the (up to) five best-scoring documents
        /// that contain at least one query term.
        /// </summary>
        /// <param name="query_input">Raw query text. A null value is treated as an empty query.</param>
        /// <returns>
        /// For each reported document: headline, path, the computed score and the snippet,
        /// separated by newlines. When no document matched, the report is "NO RESULTs".
        /// </returns>
        public string start_search(string query_input)
        {
            List<string> query_tok = TokenizeQuery(query_input ?? String.Empty);

            double[]     scores   = new double[docs.Count + 1];
            HashSet<int> docs_res = new HashSet<int>();

            // Loop-invariant divisor used to convert log10 into log2.
            double log10Of2 = Math.Log10(2);

            foreach (string tok in query_tok)
            {
                if (!dict.ContainsKey(tok))
                {
                    continue;
                }

                SearchDictionary d  = dict[tok];
                int[]            tf = new int[docs.Count + 1];

                // Walk this term's postings: record its frequency per document and
                // remember which documents contain at least one query term.
                for (int x = 0; x < d.df; x++)
                {
                    Posting p = post[d.offset + x];
                    tf[p.docid] = p.tf;
                    docs_res.Add(p.docid);
                }

                // Every document accumulates log2(0.9 * tf/doclength + 0.1 * cf/collectionSize),
                // including documents where tf is 0, so the scores of all documents stay
                // comparable. Only documents seen in some posting list are published to scores_m.
                for (int y = 1; y < tf.Length; y++)
                {
                    Docs_Info doc = docs[y];

                    scores[y] += Math.Log10((0.9 * ((double)tf[y] / (double)doc.doclength) + (0.1 * (double)d.cf / (double)collectionSize))) / log10Of2;

                    if (docs_res.Contains(y))
                    {
                        scores_m[y] = scores[y];
                    }
                }
            }

            StringBuilder to_output = new StringBuilder();
            string        newline   = Environment.NewLine;

            // Enumerate the ordered sequence directly: copying it into a Dictionary first
            // would make the output order depend on unspecified Dictionary enumeration order.
            var top5 = scores_m.OrderByDescending(pair => pair.Value).Take(5).ToList();

            foreach (var hit in top5)
            {
                int       id    = hit.Key;
                Docs_Info doc_r = docs[id];

                to_output.Append(doc_r.headline + newline + doc_r.docpath + newline + "Computed probability: " + hit.Value + newline);
                to_output.Append(doc_r.snippet + newline + newline);
            }

            if (scores_m.Count == 0)
            {
                to_output.Append("NO RESULTs" + newline);
            }

            // scores_m is shared state; reset it so the next search starts clean.
            scores_m.Clear();
            return to_output.ToString();
        }

        // Words dropped from the query after stemming.
        private static readonly HashSet<string> QueryStopWords = new HashSet<string>
        {
            "and", "an", "by", "from", "of", "the", "with", "a", "in"
        };

        // Applies a regular-expression replacement. string.Replace only matches literal text,
        // so every real pattern must go through Regex.
        private static string ReplacePattern(string input, string pattern, string replacement)
        {
            return System.Text.RegularExpressions.Regex.Replace(input, pattern, replacement);
        }

        // Splits a raw query into lower-cased, stemmed, alphanumeric tokens with
        // stop words and single-character tokens removed.
        private static List<string> TokenizeQuery(string query)
        {
            query = ReplacePattern(query, "<.*?>", " ");       // markup tags
            query = query.Replace("-", " ");
            query = query.Replace(", ", " ");
            query = query.Replace("; ", " ");
            query = query.Replace("? ", " ");
            query = query.Replace(": ", " ");
            query = query.Replace("! ", " ");
            query = query.Replace(". ", " ");
            query = ReplacePattern(query, "\\.\"|\\.'", " ");  // period followed by a quote
            query = ReplacePattern(query, " +", " ");          // collapse runs of spaces
            query = ReplacePattern(query, " [a-z] | [A-Z] ", " "); // isolated single letters

            List<string> tokens = new List<string>();

            foreach (string raw in query.Split(' '))
            {
                string token = raw.ToLowerInvariant();

                token = ReplacePattern(token, "^\\[|\\]$", "");
                token = ReplacePattern(token, "^\\(|\\)$", "");
                token = token.Replace("'", "");
                token = ReplacePattern(token, "^\"|\"$", "");
                token = token.Replace(",", "");
                token = token.Trim();

                token = StemToken(token);

                if (token.Length <= 1 || QueryStopWords.Contains(token))
                {
                    continue;
                }

                token = ReplacePattern(token, "[^A-Za-z0-9]", "");

                // Stripping punctuation may have shrunk the token; re-check its length.
                if (token.Length > 1)
                {
                    tokens.Add(token);
                }
            }

            return tokens;
        }

        // Reduces simple English plural suffixes: "ies" -> "y", "es" -> "e", trailing "s" removed.
        // The three rules run in sequence, each on the result of the previous one.
        private static string StemToken(string token)
        {
            if (token.EndsWith("ies", StringComparison.Ordinal)
                && !token.EndsWith("aies", StringComparison.Ordinal)
                && !token.EndsWith("eies", StringComparison.Ordinal))
            {
                token = token.Substring(0, token.Length - 3) + "y";
            }

            if (token.EndsWith("es", StringComparison.Ordinal)
                && !token.EndsWith("aes", StringComparison.Ordinal)
                && !token.EndsWith("ees", StringComparison.Ordinal)
                && !token.EndsWith("oes", StringComparison.Ordinal))
            {
                token = token.Substring(0, token.Length - 2) + "e";
            }

            if (token.EndsWith("s", StringComparison.Ordinal)
                && !token.EndsWith("us", StringComparison.Ordinal)
                && !token.EndsWith("ss", StringComparison.Ordinal))
            {
                token = token.Substring(0, token.Length - 1);
            }

            return token;
        }
Beispiel #2
0
        /// <summary>
        /// Reads one of the index CSV files line by line and loads its rows into the matching
        /// in-memory structure (<c>dict</c>, <c>post</c>, <c>docs</c> or <c>collectionSize</c>).
        /// Which structure is filled is decided by comparing <paramref name="filename"/> with
        /// four fixed relative paths; any other path is read but nothing is stored.
        /// </summary>
        /// <param name="filename">Path of the CSV file to load.</param>
        /// <remarks>
        /// The first line of every file is skipped (presumably a header row; confirm against
        /// the files). A missing file is reported on the console and otherwise ignored.
        /// Malformed numeric fields still surface as exceptions from <c>int.Parse</c>.
        /// </remarks>
        public void IOFile_Process(String filename)
        {
            bool isDict      = filename.Equals("..\\..\\SEDocumentExtraction\\dictionary.csv");
            bool isPosting   = filename.Equals("..\\..\\SEDocumentExtraction\\postings.csv");
            bool isDocsTable = filename.Equals("..\\..\\SEDocumentExtraction\\docsTable.csv");
            bool isTotal     = filename.Equals("..\\..\\SEDocumentExtraction\\Total.csv");

            try
            {
                // 'using' guarantees the reader is closed even when a row fails to parse.
                using (StreamReader scanner = File.OpenText(filename))
                {
                    // Zero-based line number; it advances for every line, including skipped ones,
                    // because posting rows are stored at index (line number - 1).
                    int    count = 0;
                    string s;

                    while ((s = scanner.ReadLine()) != null)
                    {
                        string[] separate = s.Split(',');

                        if (isDict && separate.Length == 4 && count > 0)
                        {
                            // Column 0 is the term; columns 1-3 are the three integer constructor arguments.
                            dict[separate[0]] = new SearchDictionary(int.Parse(separate[1].Trim()), int.Parse(separate[2].Trim()), int.Parse(separate[3].Trim()));
                        }
                        else if (isPosting && separate.Length == 2 && count > 0)
                        {
                            post[count - 1] = new Posting(int.Parse(separate[0].Trim()), int.Parse(separate[1].Trim()));
                        }
                        else if (isDocsTable && separate.Length == 5 && count > 0)
                        {
                            // Column 0 is the integer key under which the document record is stored.
                            docs[int.Parse(separate[0].Trim())] = new Docs_Info(separate[1].Trim(), int.Parse(separate[2].Trim()), separate[3].Trim(), separate[4].Trim());
                        }
                        else if (isTotal && count > 0 && s.Trim().Length > 0)
                        {
                            // Blank lines (e.g. a trailing newline) are skipped instead of
                            // being handed to int.Parse, which would throw on empty input.
                            collectionSize = int.Parse(s.Trim());
                        }

                        count = count + 1;
                    }
                }
            }
            catch (FileNotFoundException)
            {
                Console.WriteLine("File Not Found !!Please Enter Proper FileName !!");
            }
        }