private static string NormalizeQuery(string strQuery)
{
    StringBuilder sb = new StringBuilder();
    //Break query string by given dictionary
    wordseg.Segment(strQuery, tokens, false);
    for (int i = 0; i < tokens.tokenList.Count; i++)
    {
        //Normalize term and if the term is empty, ignore it
        WordSeg.Token wbTkn = tokens.tokenList[i];
        string strTerm = NormalizeTerm(wbTkn.strTerm).Trim();
        if (strTerm.Length == 0)
        {
            continue;
        }

        sb.Append(strTerm);
        sb.Append(" ");
    }

    return sb.ToString().Trim();
}
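//NormalizeTerm is called above but not shown in this section. The sketch below is only
//an assumption of what it might do (trimming edge punctuation and lower-casing), added
//for readability; the actual implementation in the project may apply different rules.
private static string NormalizeTerm(string strTerm)
{
    //Assumed behavior: strip surrounding punctuation and compare terms case-insensitively
    return strTerm.Trim().Trim('.', ',', '!', '?', ':', ';', '"', '\'').ToLowerInvariant();
}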
private static bool StatTermWeightInQuery(List<QueryItem> qiList)
{
    //Check each query item in qiList, and keep the good items for term weight calculating
    int goodQueryItemCount = 0;
    foreach (QueryItem item in qiList)
    {
        List<Token> tknList = new List<Token>();
        bool bIgnored = false;
        //Break query string by given dictionary
        wordseg.Segment(item.strQuery, tokens, false);
        for (int i = 0; i < tokens.tokenList.Count; i++)
        {
            //Normalize term and if the term is empty, ignore it
            WordSeg.Token wbTkn = tokens.tokenList[i];
            string strTerm = wbTkn.strTerm.Trim();
            if (strTerm.Length == 0)
            {
                continue;
            }

            //Check for duplicated terms in the query.
            //If a duplicated term is found, drop the current query
            for (int j = 0; j < tknList.Count; j++)
            {
                if (tknList[j].strTerm == strTerm)
                {
                    //Found the duplicated term
                    bIgnored = true;
                    break;
                }
            }

            if (bIgnored == true)
            {
                //Ignore the query with the duplicated term
                break;
            }

            //Save the token into the list
            Token token = new Token();
            token.strTerm = strTerm;
            tknList.Add(token);
        }

        if (bIgnored == false)
        {
            item.tokenList = tknList;
            goodQueryItemCount++;
        }
        else
        {
            item.tokenList = null;
        }
    }

    //The flag indicating whether one query is a sub-query or super-query of all other queries in the cluster
    bool bEntireCluster = false;
    for (int i = 0; i < qiList.Count; i++)
    {
        QueryItem selQueryItem = qiList[i];
        if (selQueryItem.tokenList == null)
        {
            continue;
        }

        Dictionary<Token, int> termHash2Freq = new Dictionary<Token, int>();
        int totalFreq = selQueryItem.freq;
        int queryInCluster = 1;
        foreach (Token item in selQueryItem.tokenList)
        {
            termHash2Freq.Add(item, selQueryItem.freq);
        }

        //Check selQueryItem's sub-queries and super-queries, and accumulate frequency statistics
        for (int j = 0; j < qiList.Count; j++)
        {
            if (i != j && qiList[j].tokenList != null)
            {
                List<Token> joinBList = new List<Token>();
                //Try to find a sub-query of selQueryItem
                if (AsubOfB(qiList[j].tokenList, selQueryItem.tokenList, joinBList) == true)
                {
                    //Found a sub-query of selQueryItem
                    //Increase the cluster, query and term frequencies
                    queryInCluster++;
                    totalFreq += qiList[j].freq;
                    foreach (Token item in joinBList)
                    {
                        termHash2Freq[item] += qiList[j].freq;
                    }
                }
                //Try to find a super-query of selQueryItem
                else if (AsuperOfB(qiList[j].tokenList, selQueryItem.tokenList, joinBList) == true)
                {
                    //Found a super-query of selQueryItem
                    //Increase the cluster, query and term frequencies
                    queryInCluster++;
                    totalFreq += qiList[j].freq;
                    foreach (Token item in joinBList)
                    {
                        termHash2Freq[item] += qiList[j].freq;
                    }
                }
            }
        }

        if (queryInCluster < MIN_CLUSTER_SIZE)
        {
            //The generated cluster is too small, ignore the current query item
            selQueryItem.tokenList = null;
            continue;
        }

        if (queryInCluster == goodQueryItemCount && selQueryItem.strQuery.Length >= 2)
        {
            //All other good queries are sub-queries or super-queries of the current query
            bEntireCluster = true;
        }

        foreach (Token item in selQueryItem.tokenList)
        {
            double fWeight = ((double)termHash2Freq[item]) / ((double)totalFreq);
            item.fWeight = fWeight;
        }
    }

    return bEntireCluster;
}
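//AsubOfB and AsuperOfB are used above but not shown in this section. The sketches below
//are only assumptions of their behavior, added to make the clustering logic easier to
//follow: A is treated as a sub-query (resp. super-query) of B when every term of the
//smaller query appears in the larger one, and joinBList collects the matched tokens of B
//so the caller can add the other query's frequency to those tokens. Token identity is
//assumed to be defined by strTerm. The project's real implementations may differ.
private static bool AsubOfB(List<Token> A, List<Token> B, List<Token> joinBList)
{
    //A is a proper sub-query of B only if it has fewer terms
    if (A.Count >= B.Count)
    {
        return false;
    }

    foreach (Token a in A)
    {
        //Each term of A must map to a token of B; the matched B tokens are
        //returned so the caller can update their frequency counts
        Token matched = B.Find(b => b.strTerm == a.strTerm);
        if (matched == null)
        {
            return false;
        }
        joinBList.Add(matched);
    }

    return true;
}

private static bool AsuperOfB(List<Token> A, List<Token> B, List<Token> joinBList)
{
    //A is a proper super-query of B only if it has more terms
    if (A.Count <= B.Count)
    {
        return false;
    }

    foreach (Token b in B)
    {
        //Every term of B must also appear in A; all of B's tokens are matched
        if (A.Find(a => a.strTerm == b.strTerm) == null)
        {
            return false;
        }
        joinBList.Add(b);
    }

    return true;
}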