Ejemplo n.º 1
0
        /// <summary>
        /// Walks the trie depth-first starting at <paramref name="p"/> (daughters first, then siblings).
        /// For every node that ends an n-gram of more than one word with a frequency of at least 5, it
        /// computes the expected frequency and the Hoeffding score, stores them on the node, and writes
        /// the line "score TAB n-gram" to <paramref name="outfile"/>.
        /// </summary>
        /// <param name="p">Node to start from; null ends the recursion.</param>
        /// <param name="stack_list">
        /// Letters on the path leading to <paramref name="p"/>. The letter of each visited node is pushed
        /// while its daughters are visited and popped again afterwards.
        /// </param>
        /// <param name="outfile">Writer that receives one line per scored n-gram.</param>
        /// <exception cref="InvalidOperationException">
        /// A unigram of the n-gram was found but the accumulated query list is null.
        /// </exception>
        public void Expectation(TrieNode p, Stack <char> stack_list, StreamWriter outfile)
        {
            if (p == null)
            {
                return;
            }

            stack_list.Push(p.letter);

            // Unigrams need not be considered (see paper for the reason), and a bi/trigram must occur
            // at least 5 times to be scored.
            if (p.endOfWord && p.NumOfWords > 1 && p.frequency >= 5)
            {
                // Stack<T>.ToArray() returns the most recently pushed letter first, so reverse it to
                // read the n-gram from the root down to this node.
                char[] letters = stack_list.ToArray();
                Array.Reverse(letters);
                string   temp  = new string(letters).Trim();
                string[] words = temp.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

                /* ************************* Finding the intersecting lists ************************* */
                List <Data> data_list = new List <Data>();
                bool        seeded    = false;
                foreach (string single in words)
                {
                    // Words are looked up with a leading " " because the root itself holds " ".
                    TrieNode unigram = this.FindWord(" " + single);
                    if (unigram == null)
                    {
                        continue; // the unigram could have been removed during pruning
                    }

                    if (!seeded)
                    {
                        // Seed with the first unigram that is actually present. Seeding only on the
                        // first word would leave the list empty whenever that word had been pruned,
                        // which forces every later intersection to be empty as well.
                        data_list = unigram.Query_list;
                        seeded    = true;
                    }
                    else if (data_list != null)
                    {
                        // Queries that contain every unigram seen so far.
                        data_list = Data.intersect(data_list, unigram.Query_list);
                    }
                    else
                    {
                        throw new InvalidOperationException("Expectation calculation:Unigram is stored without a query id");
                    }
                }

                if (data_list == null)
                {
                    throw new InvalidOperationException("Expectation calculation:Unigram is stored without a query id");
                }

                p.Set_frequency = data_list.Count;

                /* ************************* Finding the expectation ************************* */
                p.Set_expectation = 0;
                foreach (Data query in data_list)
                {
                    int number = this.factorial(query.Q_length, p.NumOfWords);

                    // Zero would divide by zero; values of 10,000,000 and above are ignored
                    // (NOTE(review): presumably because their contribution is negligible or the
                    // int result may have overflowed - confirm against factorial()).
                    if (number < 10000000 && number != 0)
                    {
                        p.Set_expectation += 1 / (( float )number);
                    }
                }

                /*
                 *  Hoeffding's inequality: F-E > sqrt(-K*log(s)/2)
                 *  so the score is -log(s) = ((F-E)^2)*2/K, with K = Set_frequency.
                 *  The score is undefined when K is 0, so such n-grams are not written.
                 */
                if (p.frequency > p.Set_expectation && p.Set_frequency > 0)
                {
                    p.Hoefding_score = ((p.frequency - p.Set_expectation) * (p.frequency - p.Set_expectation) * 2) / p.Set_frequency;
                    outfile.WriteLine(p.Hoefding_score + "\t" + temp);
                }
            }

            Expectation(p.daughter, stack_list, outfile);
            stack_list.Pop();
            Expectation(p.sibling, stack_list, outfile);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Inserts <paramref name="word"/> into the trie, creating any missing nodes, and marks the node
        /// of its last character as the end of an n-gram: endOfWord is set, frequency is incremented
        /// exactly once and NumOfWords is set to <paramref name="num_words"/>.
        /// </summary>
        /// <param name="word">
        /// Text to insert. Each word in a bigram/trigram must be separated from the others by " ".
        /// An empty string inserts nothing.
        /// </param>
        /// <param name="new_data">Added to the final node's Query_list, but only when num_words is 1.</param>
        /// <param name="num_words">Number of words in <paramref name="word"/> (tells unigram, bigram or trigram apart).</param>
        public void insert(string word, Data new_data, int num_words)
        {
            TrieNode p = this.root;
            if (p == null || word.Length == 0)
            {
                return;
            }

            int last = word.Length - 1;
            for (int i = 0; ; i++)
            {
                // Find the node for word[i] among the siblings on this level, appending one if absent.
                while (p.letter != word[i])
                {
                    if (p.sibling == null)
                    {
                        p.sibling = new TrieNode(word[i]);
                        this.TotalNodes++;
                    }
                    p = p.sibling;
                }

                if (i == last)
                {
                    break;
                }

                // Descend one level; a missing level is started with the next letter of the word.
                if (p.daughter == null)
                {
                    p.daughter = new TrieNode(word[i + 1]);
                    this.TotalNodes++;
                }
                p = p.daughter;
            }

            // Mark the terminal node exactly once, whether it already existed, was created as a
            // daughter or was created as a sibling.
            p.endOfWord = true;
            p.frequency++;
            p.NumOfWords = num_words;
            if (num_words == 1)
            {
                p.Query_list.Add(new_data);
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates a trie that contains only its root node.
 /// </summary>
 public Trie()
 {
     // The root is built first with its frequency preset to 100, then published.
     TrieNode rootNode = new TrieNode();
     rootNode.frequency = 100;

     root       = rootNode;
     TotalNodes = 1; // the root is the only node so far
 }