コード例 #1
0
        /// <summary>
        /// Splits <paramref name="text"/> into overlapping two-character grams, using "$" as the
        /// marker before the first character and after the last one, and hands each gram to
        /// <c>SQL_DB.insert_into_DB_BIGRAMTable</c> in left-to-right order.
        /// </summary>
        /// <param name="text">
        /// Term to split. A term of n characters produces n + 1 grams; an empty string produces
        /// the single gram "$$".
        /// </param>
        /// <remarks>
        /// No duplicate check is done in this method, so a gram that occurs twice in the term is
        /// passed on twice. NOTE(review): uniqueness presumably has to be enforced inside SQL_DB;
        /// confirm against that class.
        /// </remarks>
        public static void AddBiGram(string text)
        {
            const string boundary = "$";

            // Read the length before building anything so a null term fails up front,
            // before a single gram has been inserted.
            int gramCount = text.Length + 1;
            string padded = boundary + text + boundary;

            // Slide a window of width two over the padded term.
            for (int start = 0; start < gramCount; start++)
            {
                SQL_DB.insert_into_DB_BIGRAMTable(padded.Substring(start, 2));
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds a Soundex-style code for <paramref name="single_term"/> and passes it to
        /// <c>SQL_DB.insert_into_DB_SoundexTable</c>.
        /// </summary>
        /// <param name="single_term">Term to encode. An empty string is ignored.</param>
        /// <remarks>
        /// Steps, in order:
        /// 1. keep the first character of the term as it is;
        /// 2. for every following character, append the value of the first entry of
        ///    <c>Constants.DictionarysoundexList</c> whose key contains that character
        ///    (characters found in no entry contribute nothing);
        /// 3. drop every symbol that equals the symbol directly before it;
        /// 4. drop every '0' symbol;
        /// 5. right-pad with '0' up to four symbols. Longer codes are NOT shortened.
        /// </remarks>
        public static void AddSoundex(string single_term)
        {
            if (single_term == string.Empty)
            {
                return;
            }

            // Steps 1 and 2: leading character followed by the mapped value of each later one.
            string rawCode = single_term[0].ToString();
            for (int pos = 1; pos < single_term.Length; pos++)
            {
                char letter = single_term[pos];
                foreach (var mapping in Constants.DictionarysoundexList)
                {
                    if (!mapping.Key.Contains(letter))
                    {
                        continue;
                    }

                    rawCode += mapping.Value;
                    break;
                }
            }

            // Steps 3 and 4 in a single pass. "Repeated" is judged against the neighbour in the
            // raw code, i.e. before any zero has been taken out, so two equal symbols that were
            // separated by a zero both survive.
            var kept = new System.Text.StringBuilder(rawCode.Length);
            for (int k = 0; k < rawCode.Length; k++)
            {
                char symbol = rawCode[k];
                bool repeatsPrevious = k > 0 && symbol == rawCode[k - 1];
                if (repeatsPrevious || symbol == '0')
                {
                    continue;
                }

                kept.Append(symbol);
            }

            // Step 5: pad short codes; codes of four or more symbols pass through unchanged.
            string code = kept.ToString().PadRight(4, '0');

            SQL_DB.insert_into_DB_SoundexTable(code);
        }
コード例 #3
0
        /// <summary>
        /// Runs the whole indexing pipeline over every document returned by
        /// <c>SQL_DB.GetAllUrlData()</c>.
        /// </summary>
        /// <remarks>
        /// Stages, in order:
        /// 1. tokenize each document's content and run <c>linguistics_Algo</c> on the tokens;
        /// 2. record, per document, the token positions of every non-stop-word term and merge
        ///    them into the <c>terms</c> field (term -> one posting per document);
        /// 3. write every posting of every purely alphabetic term to the term/doc table;
        /// 4. write bigrams and a soundex code for every key in <c>terms</c> (no alphabetic
        ///    filter is applied at this stage);
        /// 5. call <c>stemming()</c>, store its result in <c>inverted_index_table</c>, and write
        ///    the purely alphabetic entries of that table to the terms and inverted-index tables.
        /// HTML parsing of the content is intentionally skipped: the stored content is used as-is.
        /// NOTE(review): <c>terms</c> is not cleared here, so postings presumably accumulate if
        /// this method is called more than once on the same instance; confirm.
        /// </remarks>
        public void indexing()
        {
            List<URL_Data> contents = SQL_DB.GetAllUrlData();

            // Stages 1 and 2: build the in-memory term -> postings map.
            foreach (var document in contents)
            {
                List<string> rawTokens = tokenize(document.content);
                List<string> docTokens = linguistics_Algo(rawTokens);

                foreach (var occurrence in CollectTermPositions(docTokens))
                {
                    inverted_index posting = new inverted_index
                    {
                        DocId     = document.ID,
                        Frequency = occurrence.Value.Count,
                        // Positions are stored as a comma separated string, e.g. "3,17,42".
                        Position  = string.Join(",", occurrence.Value)
                    };

                    // Single lookup: fetch the posting list, creating it on first sight of the term.
                    List<inverted_index> postings;
                    if (!terms.TryGetValue(occurrence.Key, out postings))
                    {
                        postings = new List<inverted_index>();
                        terms.Add(occurrence.Key, postings);
                    }

                    postings.Add(posting);
                }
            }

            // Stage 3: save term/doc pairs before stemming; the next module is built from them.
            foreach (var entry in terms)
            {
                // The filter depends only on the term, so evaluate it once per term
                // instead of once per posting.
                if (!IsAlphabeticTerm(entry.Key))
                {
                    continue;
                }

                foreach (var posting in entry.Value)
                {
                    SQL_DB.insert_into_DB_TermDocTable(entry.Key, posting);
                }
            }

            // Stage 4: bigrams first for all terms, then soundex codes for all terms.
            foreach (var entry in terms)
            {
                AddBiGram(entry.Key);
            }

            foreach (var entry in terms)
            {
                AddSoundex(entry.Key);
            }

            // Stage 5: stem, then persist the resulting inverted index.
            inverted_index_table = stemming();

            foreach (var rec in inverted_index_table)
            {
                if (!IsAlphabeticTerm(rec.Key))
                {
                    continue;
                }

                int term_id = SQL_DB.insert_into_DB_termsTable(rec.Key);
                foreach (var posting in rec.Value)
                {
                    posting.Term_id = term_id;
                    SQL_DB.insert_into_DB_invertedindexTable(rec.Key, posting);
                }
            }
        }

        // Maps every token that is not in Constants.StopWords to the list of indexes at which it
        // occurs in the token list (indexes are counted before stop words are dropped).
        private static Dictionary<string, List<int>> CollectTermPositions(List<string> tokens)
        {
            var positionsByTerm = new Dictionary<string, List<int>>();

            for (int position = 0; position < tokens.Count; position++)
            {
                string token = tokens[position];
                if (Constants.StopWords.Contains(token))
                {
                    continue;
                }

                List<int> positions;
                if (!positionsByTerm.TryGetValue(token, out positions))
                {
                    positions = new List<int>();
                    positionsByTerm.Add(token, positions);
                }

                positions.Add(position);
            }

            return positionsByTerm;
        }

        // True when the term matches the ASCII-letters-only pattern used to decide which terms
        // are written to the database. A single space can never match, so no separate check
        // for " " is needed.
        private static bool IsAlphabeticTerm(string term)
        {
            return Regex.IsMatch(term, @"^[a-zA-Z]+$");
        }