Exemplo n.º 1
0
        /// <summary>
        /// Looks up the stemmed, non-stop-word terms of <paramref name="searchterms"/> in the
        /// <c>Inverted_index</c> table, ranks the matching documents and exposes the resulting
        /// URLs to the view through <c>ViewBag.Urls</c>.
        /// </summary>
        /// <param name="searchterms">
        /// Raw query text. A query wrapped in double quotes is handled as an exact search:
        /// only documents containing every query term are passed to <c>check_distance</c>.
        /// Any other query is grouped by number of matched terms and passed to
        /// <c>calculate_distance</c>.
        /// </param>
        /// <param name="type">
        /// Correction mode. <c>"spell"</c> records a term that has no index rows and asks
        /// <c>spellchecker_words</c> for suggestions; <c>"soundex"</c> asks
        /// <c>Soundex_words</c> for suggestions on the first query term. Suggestions are only
        /// produced for non-exact searches.
        /// </param>
        /// <returns>The default view, with <c>ViewBag.Urls</c>, <c>ViewBag.query</c> and <c>ViewBag.type</c> set.</returns>
        public ActionResult geturls(string searchterms, string type)
        {
            // Nothing to search for: render an empty result list instead of failing on
            // searchterms[0] further down.
            if (string.IsNullOrEmpty(searchterms))
            {
                ViewBag.Urls  = new List<string>();
                ViewBag.query = searchterms;
                ViewBag.type  = type;
                return(View());
            }

            searchquery s = new searchquery();
            s.query = searchterms;

            // Length >= 2 keeps a lone '"' from reaching Substring(1, -1).
            bool exact_search = false;
            if (searchterms.Length >= 2 && searchterms[0] == '"' && searchterms[searchterms.Length - 1] == '"')
            {
                s.query      = searchterms.Substring(1, searchterms.Length - 2);
                exact_search = true;
            }

            // Tokenize only after the quotes were stripped; previously the stripped query was
            // assigned after Query_to_words() had already run and was therefore never used.
            // NOTE(review): Query_to_words is defined elsewhere - assumed to tokenize s.query.
            string[] searchquery_terms = s.Query_to_words();

            // (term, doc id) -> (frequency, positions of the term in that document)
            Dictionary<Tuple<string, int>, Tuple<int, List<int>>> dict = new Dictionary<Tuple<string, int>, Tuple<int, List<int>>>();
            // stemmed term -> number of index rows read for it
            Dictionary<string, int> Ndocs = new Dictionary<string, int>();

            // Doc_No keeps first-seen order; seen_docs makes the membership test O(1).
            List<int>     Doc_No    = new List<int>();
            HashSet<int>  seen_docs = new HashSet<int>();
            List<string>  searchquery_terms_stemmed = new List<string>();

            // NOTE(review): the connection string is hard-coded; consider moving it to configuration.
            using (SqlConnection con = new SqlConnection(@"Data Source=HOSSAM\MOHAMEDHOSSAM;Initial Catalog=web_crawler;Integrated Security=True"))
            {
                con.Open();

                for (int i = 0; i < searchquery_terms.Length; i++)
                {
                    if (Remove_stopwords(searchquery_terms[i]))
                    {
                        continue;
                    }

                    Porter stemer = new Porter();
                    string output = stemer.stem(searchquery_terms[i]);
                    searchquery_terms_stemmed.Add(output);

                    // Two query words with the same stem would otherwise hit Ndocs.Add /
                    // dict.Add a second time and throw ArgumentException.
                    if (Ndocs.ContainsKey(output))
                    {
                        continue;
                    }

                    using (SqlCommand cmd = new SqlCommand("select * from Inverted_index where Term=@term", con))
                    {
                        cmd.Parameters.Add("@term", SqlDbType.VarChar).Value = output;

                        using (SqlDataReader reader = cmd.ExecuteReader())
                        {
                            if (!reader.HasRows && type == "spell")
                            {
                                // Remember the original (unstemmed) word for the suggestion step.
                                misSpelledword = searchquery_terms[i];
                            }
                            else
                            {
                                int repeation = 0;
                                while (reader.Read())
                                {
                                    string word      = reader[0].ToString();
                                    int    doc_id    = (int)reader[1];
                                    int    frequency = (int)reader[2];

                                    if (seen_docs.Add(doc_id))
                                    {
                                        Doc_No.Add(doc_id);
                                    }

                                    // Column 3 is parsed as a comma-separated list of ints; empty
                                    // entries (empty value, trailing comma) are skipped rather than
                                    // failing int.Parse.
                                    string[]  arr            = reader[3].ToString().Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                                    List<int> term_positions = new List<int>(Array.ConvertAll(arr, int.Parse));

                                    dict.Add(Tuple.Create(word, doc_id), Tuple.Create(frequency, term_positions));
                                    repeation++;
                                }
                                Ndocs.Add(output, repeation);
                            }
                        }
                    }
                }
            }

            // For every document, collect the query terms it contains (with frequency and positions).
            List<Tuple<int, List<Tuple<string, int, List<int>>>>> matches = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
            foreach (int doc_id in Doc_No)
            {
                List<Tuple<string, int, List<int>>> words = new List<Tuple<string, int, List<int>>>();
                foreach (string term in searchquery_terms_stemmed)
                {
                    Tuple<int, List<int>> posting;
                    if (dict.TryGetValue(Tuple.Create(term, doc_id), out posting))
                    {
                        words.Add(Tuple.Create(term, posting.Item1, posting.Item2));
                    }
                }
                matches.Add(Tuple.Create(doc_id, words));
            }

            List<string> Urls;

            if (exact_search)
            {
                // Keep only the documents that contain every query term.
                List<Tuple<int, List<Tuple<string, int, List<int>>>>> docs = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
                foreach (var match in matches)
                {
                    if (match.Item2.Count == searchquery_terms_stemmed.Count)
                    {
                        docs.Add(match);
                    }
                }

                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Docs_distances = check_distance(docs, searchquery_terms_stemmed.Count, Ndocs);

                // Highest score (Item2) first.
                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Exact_Docs = Docs_distances.OrderByDescending(d => d.Item2).ToList();

                Urls = Read_URls_from_database(Exact_Docs);
            }
            else
            {
                // number of matched query terms -> documents matching that many terms
                Dictionary<int, List<Tuple<int, List<Tuple<string, int, List<int>>>>>> num_of_occurence = new Dictionary<int, List<Tuple<int, List<Tuple<string, int, List<int>>>>>>();
                foreach (var match in matches)
                {
                    List<Tuple<int, List<Tuple<string, int, List<int>>>>> bucket;
                    if (!num_of_occurence.TryGetValue(match.Item2.Count, out bucket))
                    {
                        bucket = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
                        num_of_occurence.Add(match.Item2.Count, bucket);
                    }
                    bucket.Add(match);
                }

                // Documents matching more terms come first; within a group, smaller Item2 first.
                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Inexact_Docs = new List<Tuple<int, float, List<Tuple<string, int, List<int>>>>>();
                foreach (var group in num_of_occurence.OrderByDescending(g => g.Key))
                {
                    List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> docs_distances = calculate_distance(group.Value, group.Key);
                    Inexact_Docs.AddRange(docs_distances.OrderBy(d => d.Item2));
                }

                Urls = Read_URls_from_database(Inexact_Docs);

                if (!string.IsNullOrEmpty(misSpelledword) && type == "spell")
                {
                    List<string> nearest_words = spellchecker_words(misSpelledword);
                    ViewBag.nearest_words = nearest_words;
                }
                else if (type == "soundex" && searchquery_terms.Length > 0)
                {
                    // Guarded so a query that tokenizes to nothing does not index past the array.
                    misSpelledword = searchquery_terms[0];
                    List<string> soundex_words = Soundex_words(misSpelledword);
                    ViewBag.nearest_words = soundex_words;
                }
            }

            ViewBag.Urls  = Urls;
            ViewBag.query = searchterms;
            ViewBag.type  = type;
            return(View());
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs a search: tokenizes the query, drops stop words, stems the remaining terms,
        /// reads their postings from the <c>Inverted_index</c> table, ranks the matching
        /// documents and exposes the resulting URLs to the view through <c>ViewBag.Urls</c>.
        /// </summary>
        /// <param name="searchterms">
        /// Raw query text. A query wrapped in double quotes is handled as an exact search:
        /// only documents containing every query term are passed to <c>check_distance</c>.
        /// Any other query is a multi-word search: documents are grouped by the number of
        /// matched terms and passed to <c>calculate_distance</c>.
        /// </param>
        /// <param name="type">
        /// Correction mode. <c>"spell"</c> records a term that has no index rows and asks
        /// <c>spellchecker_words</c> for suggestions; <c>"soundex"</c> asks
        /// <c>Soundex_words</c> for suggestions on the first query term. Suggestions are only
        /// produced for non-exact searches.
        /// </param>
        /// <returns>The default view, with <c>ViewBag.type</c>, <c>ViewBag.Urls</c> and <c>ViewBag.query</c> set.</returns>
        public ActionResult searchquery(string searchterms, string type)
        {
            // Nothing to search for: render an empty result list instead of failing on
            // searchterms[0] further down.
            if (string.IsNullOrEmpty(searchterms))
            {
                ViewBag.type  = type;
                ViewBag.Urls  = new List<string>();
                ViewBag.query = searchterms;
                return(View());
            }

            searchquery s = new searchquery();
            s.query = searchterms;

            // Exact search is requested by wrapping the query in double quotes; take the
            // query without them. Length >= 2 keeps a lone '"' from reaching Substring(1, -1).
            bool exact_search = false;
            if (searchterms.Length >= 2 && searchterms[0] == '"' && searchterms[searchterms.Length - 1] == '"')
            {
                s.query      = searchterms.Substring(1, searchterms.Length - 2);
                exact_search = true;
            }

            // 1- tokenization and case folding. Done only after the quotes were stripped;
            // previously the stripped query was assigned after Query_to_words() had already
            // run and was therefore never used.
            // NOTE(review): Query_to_words is defined elsewhere - assumed to tokenize s.query.
            string[] searchquery_terms = s.Query_to_words();

            // (term, doc id) -> (frequency, positions of the term in that document)
            Dictionary<Tuple<string, int>, Tuple<int, List<int>>> dict = new Dictionary<Tuple<string, int>, Tuple<int, List<int>>>();
            // stemmed term -> number of index rows read for it
            Dictionary<string, int> Ndocs = new Dictionary<string, int>();

            // Doc_No keeps first-seen order; seen_docs makes the membership test O(1).
            List<int>     Doc_No    = new List<int>();
            HashSet<int>  seen_docs = new HashSet<int>();

            // 2- stop-word removal and Porter stemming
            List<string>  searchquery_terms_stemmed = new List<string>();

            // NOTE(review): the connection string is hard-coded; consider moving it to configuration.
            using (SqlConnection con = new SqlConnection(@"Data Source=DESKTOP-KMG2RBB\SQLEXPRESS;Initial Catalog=web_crawler;Integrated Security=True"))
            {
                con.Open();

                for (int i = 0; i < searchquery_terms.Length; i++)
                {
                    if (Remove_stopwords(searchquery_terms[i]))
                    {
                        continue; // stop word
                    }

                    Porter stemer = new Porter();
                    string output = stemer.stem(searchquery_terms[i]);
                    searchquery_terms_stemmed.Add(output);

                    // Two query words with the same stem would otherwise hit Ndocs.Add /
                    // dict.Add a second time and throw ArgumentException.
                    if (Ndocs.ContainsKey(output))
                    {
                        continue;
                    }

                    using (SqlCommand cmd = new SqlCommand("select * from Inverted_index where Term=@term", con))
                    {
                        cmd.Parameters.Add("@term", SqlDbType.VarChar).Value = output;

                        using (SqlDataReader reader = cmd.ExecuteReader())
                        {
                            if (!reader.HasRows && type == "spell")
                            {
                                // No index rows and spelling correction was requested: remember
                                // the original (unstemmed) word for the suggestion step.
                                misSpelledword = searchquery_terms[i];
                            }
                            else
                            {
                                int repeation = 0;
                                while (reader.Read())
                                {
                                    string word      = reader[0].ToString();
                                    int    doc_id    = (int)reader[1];
                                    int    frequency = (int)reader[2];

                                    if (seen_docs.Add(doc_id))
                                    {
                                        Doc_No.Add(doc_id);
                                    }

                                    // Column 3 is parsed as a comma-separated list of ints; empty
                                    // entries (empty value, trailing comma) are skipped rather than
                                    // failing int.Parse.
                                    string[]  arr            = reader[3].ToString().Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
                                    List<int> term_positions = new List<int>(Array.ConvertAll(arr, int.Parse));

                                    dict.Add(Tuple.Create(word, doc_id), Tuple.Create(frequency, term_positions));
                                    repeation++;
                                }
                                Ndocs.Add(output, repeation);
                            }
                        }
                    }
                }
            }

            // For every document that contains at least one query term, collect the query
            // terms it contains together with their frequency and positions.
            List<Tuple<int, List<Tuple<string, int, List<int>>>>> matches = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
            foreach (int doc_id in Doc_No)
            {
                List<Tuple<string, int, List<int>>> words = new List<Tuple<string, int, List<int>>>();
                foreach (string term in searchquery_terms_stemmed)
                {
                    Tuple<int, List<int>> posting;
                    if (dict.TryGetValue(Tuple.Create(term, doc_id), out posting))
                    {
                        words.Add(Tuple.Create(term, posting.Item1, posting.Item2));
                    }
                }
                matches.Add(Tuple.Create(doc_id, words));
            }

            List<string> Urls;

            if (exact_search)
            {
                // Exact search: keep only the documents that contain every query term;
                // check_distance then evaluates them.
                List<Tuple<int, List<Tuple<string, int, List<int>>>>> docs = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
                foreach (var match in matches)
                {
                    if (match.Item2.Count == searchquery_terms_stemmed.Count)
                    {
                        docs.Add(match);
                    }
                }

                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Docs_distances = check_distance(docs, searchquery_terms_stemmed.Count, Ndocs);

                // Rank by Item2, highest first.
                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Exact_Docs = Docs_distances.OrderByDescending(d => d.Item2).ToList();

                Urls = Read_URls_from_database(Exact_Docs);
            }
            else
            {
                // Multi-word search.
                // number of matched query terms -> documents matching that many terms
                Dictionary<int, List<Tuple<int, List<Tuple<string, int, List<int>>>>>> num_of_occurence = new Dictionary<int, List<Tuple<int, List<Tuple<string, int, List<int>>>>>>();
                foreach (var match in matches)
                {
                    List<Tuple<int, List<Tuple<string, int, List<int>>>>> bucket;
                    if (!num_of_occurence.TryGetValue(match.Item2.Count, out bucket))
                    {
                        bucket = new List<Tuple<int, List<Tuple<string, int, List<int>>>>>();
                        num_of_occurence.Add(match.Item2.Count, bucket);
                    }
                    bucket.Add(match);
                }

                // Documents matching more terms come first; within a group, rank ascending by
                // Item2 of the calculate_distance result.
                List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> Inexact_Docs = new List<Tuple<int, float, List<Tuple<string, int, List<int>>>>>();
                foreach (var group in num_of_occurence.OrderByDescending(g => g.Key))
                {
                    List<Tuple<int, float, List<Tuple<string, int, List<int>>>>> docs_distances = calculate_distance(group.Value, group.Key);
                    Inexact_Docs.AddRange(docs_distances.OrderBy(d => d.Item2));
                }

                Urls = Read_URls_from_database(Inexact_Docs);

                if (!string.IsNullOrEmpty(misSpelledword) && type == "spell")
                {
                    // A term had no index rows and spelling correction was requested.
                    List<string> nearest_words = spellchecker_words(misSpelledword);
                    ViewBag.nearest_words = nearest_words;
                }
                else if (type == "soundex" && searchquery_terms.Length > 0)
                {
                    // Phonetic correction on the first query term; guarded so a query that
                    // tokenizes to nothing does not index past the array.
                    misSpelledword = searchquery_terms[0];
                    List<string> soundex_words = Soundex_words(misSpelledword);
                    ViewBag.nearest_words = soundex_words;
                }
            }

            ViewBag.type  = type;
            ViewBag.Urls  = Urls;
            ViewBag.query = searchterms;
            return(View());
        }