Beispiel #1
0
        /// <summary>
        /// Merges two posting lists by document id.
        /// </summary>
        /// <remarks>
        /// Each entry of <paramref name="list1"/> is paired with the first not-yet-paired entry of
        /// <paramref name="list2"/> that has the same <c>DocId</c>. A pair becomes one new entry whose
        /// frequency is the sum of both frequencies and whose position string is both position strings
        /// joined with a comma. Entries without a partner are carried over unchanged.
        /// Neither input list is modified.
        /// </remarks>
        /// <param name="list1">First posting list.</param>
        /// <param name="list2">Second posting list.</param>
        /// <returns>
        /// A new list containing the merged pairs (in <paramref name="list1"/> order), followed by the
        /// unpaired entries of <paramref name="list1"/>, followed by the unpaired entries of
        /// <paramref name="list2"/>.
        /// </returns>
        private List <inverted_index> MergeIndexValues(List <inverted_index> list1, List <inverted_index> list2)
        {
            List <inverted_index> mergedPairs   = new List <inverted_index>();
            List <inverted_index> unpairedFirst = new List <inverted_index>();

            // Work on a copy so that consuming matched entries does not mutate the caller's list.
            List <inverted_index> remainingSecond = new List <inverted_index>(list2);

            foreach (inverted_index first in list1)
            {
                int matchIndex = remainingSecond.FindIndex(candidate => candidate.DocId == first.DocId);
                if (matchIndex < 0)
                {
                    unpairedFirst.Add(first);
                    continue;
                }

                inverted_index second = remainingSecond[matchIndex];
                mergedPairs.Add(new inverted_index
                {
                    DocId     = first.DocId,
                    // The joined position string holds exactly first.Frequency + second.Frequency
                    // positions, so the merged frequency is the plain sum (no extra + 1).
                    Frequency = first.Frequency + second.Frequency,
                    Position  = first.Position + "," + second.Position
                });

                // Each entry of the second list may be paired at most once.
                remainingSecond.RemoveAt(matchIndex);
            }

            mergedPairs.AddRange(unpairedFirst);
            mergedPairs.AddRange(remainingSecond);
            return mergedPairs;
        }
Beispiel #2
0
        /// <summary>
        /// Inserts one row into the <c>invertedIndex</c> table for the given term and posting.
        /// </summary>
        /// <param name="term">Value stored in the <c>Term</c> column.</param>
        /// <param name="item">
        /// Posting whose <c>Term_id</c>, <c>DocId</c>, <c>Frequency</c> and <c>Position</c> values
        /// fill the remaining columns.
        /// </param>
        public static void insert_into_DB_invertedindexTable(string term, inverted_index item)
        {
            const string insertStr = @" insert into invertedIndex(Term_id, Term, DocId, Frequency, Positions)values(@Term_id, @Term, @DocId, @Frequency, @Positions)";

            // The using blocks guarantee the command and connection are released
            // even when opening the connection or executing the insert throws.
            using (SqlConnection con = new SqlConnection(connectionString))
            using (SqlCommand cmd = new SqlCommand(insertStr, con))
            {
                cmd.Parameters.Add(new SqlParameter("@Term_id", item.Term_id));
                cmd.Parameters.Add(new SqlParameter("@Term", term));
                cmd.Parameters.Add(new SqlParameter("@DocId", item.DocId));
                cmd.Parameters.Add(new SqlParameter("@Frequency", item.Frequency));
                cmd.Parameters.Add(new SqlParameter("@Positions", item.Position));

                con.Open();
                cmd.ExecuteNonQuery();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Runs the whole indexing process over every stored URL content.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Tokenize each document, run the linguistics step, and record the token positions of every
        ///    non-stop-word term into the <c>terms</c> field (term -> list of per-document postings).
        /// 2. Store every purely alphabetic term with each of its postings in the term/doc table.
        /// 3. Add the bigram and then the soundex of every term.
        /// 4. Run stemming and store the resulting terms and postings in the terms and inverted-index tables.
        /// NOTE(review): <c>terms</c> is not reset here, so it presumably must be empty on entry - confirm.
        /// </remarks>
        public void indexing()
        {
            // Only terms made of ASCII letters are persisted. Blank or whitespace keys can never
            // match this pattern, so no separate check against " " is needed.
            const string alphabeticPattern = @"^[a-zA-Z]+$";

            // Select the contents of all URLs.
            List <URL_Data> contents = SQL_DB.GetAllUrlData();

            foreach (URL_Data document in contents)
            {
                // HTML parsing (ParseHTMLContent) is deliberately skipped to reduce run time,
                // because the stored data is already managed; the content is tokenized directly.
                List <string> splittedContentList = tokenize(document.content);

                // Linguistics step.
                List <string> doc_Tokens = linguistics_Algo(splittedContentList);

                // Term -> positions of that term in the token list. Positions are the indices
                // before stop-word removal, so stop words still occupy a position.
                Dictionary <string, List <int> > doc_Terms = new Dictionary <string, List <int> >();
                for (int position = 0; position < doc_Tokens.Count; position++)
                {
                    string token = doc_Tokens[position];
                    if (Constants.StopWords.Contains(token))
                    {
                        continue;
                    }

                    List <int> positions;
                    if (!doc_Terms.TryGetValue(token, out positions))
                    {
                        positions = new List <int>();
                        doc_Terms.Add(token, positions);
                    }
                    positions.Add(position);
                }

                // Fold this document's terms into the global dictionary, which holds every unique
                // non-stop-word term of all documents with one posting (doc id, frequency, positions) per document.
                foreach (var dTerm in doc_Terms)
                {
                    inverted_index doc_term_data = new inverted_index
                    {
                        DocId     = document.ID,
                        Frequency = dTerm.Value.Count,
                        // Positions are stored as a comma-separated string.
                        Position  = string.Join(",", dTerm.Value)
                    };

                    if (terms.ContainsKey(dTerm.Key))
                    {
                        // Known term: append this document's posting.
                        terms[dTerm.Key].Add(doc_term_data);
                    }
                    else
                    {
                        // New term: start its posting list.
                        terms.Add(dTerm.Key, new List <inverted_index> {
                            doc_term_data
                        });
                    }
                }
            }

            // Save each term and its postings in a table before stemming, to build the next module.
            foreach (var term in terms)
            {
                // The filter depends only on the term, so evaluate it once per term.
                if (!Regex.IsMatch(term.Key, alphabeticPattern))
                {
                    continue;
                }

                foreach (var posting in term.Value)
                {
                    SQL_DB.insert_into_DB_TermDocTable(term.Key, posting);
                }
            }

            // Get the bigram of every term and put it in the DB table.
            foreach (var term in terms)
            {
                AddBiGram(term.Key);
            }

            // Get the soundex of every term and put it in the DB table.
            foreach (var term in terms)
            {
                AddSoundex(term.Key);
            }

            inverted_index_table = stemming(); // stemming algorithm

            // Build the inverted index after stemming.
            foreach (var rec in inverted_index_table)
            {
                if (!Regex.IsMatch(rec.Key, alphabeticPattern))
                {
                    continue;
                }

                int term_id = SQL_DB.insert_into_DB_termsTable(rec.Key);
                foreach (var posting in rec.Value)
                {
                    // Stamp each posting with the id returned by the terms-table insert before storing it.
                    posting.Term_id = term_id;
                    SQL_DB.insert_into_DB_invertedindexTable(rec.Key, posting);
                }
            }
        }