예제 #1
0
        /// <summary>
        /// Segments a query string and rebuilds it from its normalized terms.
        /// </summary>
        /// <param name="strQuery">Raw query text passed to <c>wordseg.Segment</c>.</param>
        /// <returns>
        /// The non-empty normalized terms, in segmentation order, joined by single
        /// spaces. An empty string is returned when no term survives normalization.
        /// </returns>
        private static string NormalizeQuery(string strQuery)
        {
            //Segment the query; the resulting tokens are written into the
            //'tokens' object that is declared outside this method
            wordseg.Segment(strQuery, tokens, false);

            StringBuilder normalized = new StringBuilder();
            for (int idx = 0; idx < tokens.tokenList.Count; idx++)
            {
                //Normalize and trim the term; terms that end up empty are skipped
                string term = NormalizeTerm(tokens.tokenList[idx].strTerm).Trim();
                if (term.Length > 0)
                {
                    //Put the separator in front of every term except the first,
                    //so no trailing space has to be removed afterwards
                    if (normalized.Length > 0)
                    {
                        normalized.Append(" ");
                    }

                    normalized.Append(term);
                }
            }

            return normalized.ToString();
        }
예제 #2
0
        /// <summary>
        /// Tokenizes every query in the list and then assigns a weight to each term
        /// of the queries that survive, based on the frequencies of related queries.
        /// </summary>
        /// <remarks>
        /// Pass 1 segments each query with <c>wordseg.Segment</c>, trims the terms and
        /// skips empty ones. A query in which the same term text occurs twice gets
        /// <c>tokenList = null</c> and is ignored afterwards; every other query gets
        /// its list of distinct terms and is counted as usable.
        /// Pass 2 visits each query that still has a token list. Its own frequency
        /// seeds a per-term table and a running total. Every other query with a
        /// token list for which <c>AsubOfB</c> or <c>AsuperOfB</c> returns true adds
        /// one to the cluster size, adds its frequency to the total, and adds its
        /// frequency to each token placed in the list handed to those helpers.
        /// A query whose cluster size is below <c>MIN_CLUSTER_SIZE</c> has its
        /// <c>tokenList</c> reset to null and receives no weights. Otherwise each
        /// token's <c>fWeight</c> becomes its table entry divided by the total.
        /// </remarks>
        /// <param name="qiList">
        /// Queries to process. Each item's <c>tokenList</c> is overwritten and the
        /// weights are written into the tokens in place.
        /// </param>
        /// <returns>
        /// True when, for at least one query whose <c>strQuery</c> has two or more
        /// characters, the cluster size equals the number of queries counted as
        /// usable in pass 1; false otherwise.
        /// </returns>
        private static bool StatTermWeightInQuery(List <QueryItem> qiList)
        {
            //Pass 1: tokenize each query and keep the ones usable for term weight calculating
            int usableQueryCount = 0;

            foreach (QueryItem query in qiList)
            {
                List <Token> uniqueTerms     = new List <Token>();
                bool         hasRepeatedTerm = false;

                //Break query string by given dictionary
                wordseg.Segment(query.strQuery, tokens, false);
                for (int pos = 0; pos < tokens.tokenList.Count; pos++)
                {
                    //Trim the term and skip it when nothing is left
                    string term = tokens.tokenList[pos].strTerm.Trim();
                    if (term.Length == 0)
                    {
                        continue;
                    }

                    //A term that was already collected disqualifies the whole query
                    if (uniqueTerms.Exists(known => known.strTerm == term))
                    {
                        hasRepeatedTerm = true;
                        break;
                    }

                    uniqueTerms.Add(new Token { strTerm = term });
                }

                if (hasRepeatedTerm)
                {
                    //Mark the query as ignored for pass 2
                    query.tokenList = null;
                }
                else
                {
                    query.tokenList = uniqueTerms;
                    usableQueryCount++;
                }
            }

            //Pass 2: build a cluster around every usable query and weight its terms.
            //The flag records whether some query's cluster spans all usable queries.
            bool coversWholeCluster = false;

            for (int i = 0; i < qiList.Count; i++)
            {
                QueryItem current = qiList[i];
                if (current.tokenList == null)
                {
                    continue;
                }

                //The query itself is the first member of its cluster
                int clusterSize = 1;
                int clusterFreq = current.freq;
                Dictionary <Token, int> freqByTerm = new Dictionary <Token, int>();
                foreach (Token token in current.tokenList)
                {
                    freqByTerm.Add(token, current.freq);
                }

                for (int j = 0; j < qiList.Count; j++)
                {
                    QueryItem other = qiList[j];
                    if (i == j || other.tokenList == null)
                    {
                        continue;
                    }

                    //Sub-query check first; the super-query check runs only when
                    //the first one fails and receives the same output list.
                    //NOTE(review): sharedTerms is presumably filled by the helpers
                    //with tokens taken from current.tokenList - confirm in AsubOfB/AsuperOfB
                    List <Token> sharedTerms = new List <Token>();
                    bool         isRelated   = AsubOfB(other.tokenList, current.tokenList, sharedTerms) ||
                                               AsuperOfB(other.tokenList, current.tokenList, sharedTerms);
                    if (!isRelated)
                    {
                        continue;
                    }

                    //Increase the cluster, query and term's frequency
                    clusterSize++;
                    clusterFreq += other.freq;
                    foreach (Token token in sharedTerms)
                    {
                        freqByTerm[token] += other.freq;
                    }
                }

                if (clusterSize < MIN_CLUSTER_SIZE)
                {
                    //The generated cluster is too small, ignore current query item
                    current.tokenList = null;
                    continue;
                }

                if (clusterSize == usableQueryCount && current.strQuery.Length >= 2)
                {
                    //All other usable queries are current query's sub or super set
                    coversWholeCluster = true;
                }

                //Weight of a term = its accumulated frequency / total cluster frequency
                foreach (Token token in current.tokenList)
                {
                    token.fWeight = ((double)freqByTerm[token]) / ((double)clusterFreq);
                }
            }

            return coversWholeCluster;
        }