/// <summary>
/// Merges the per-(query, clicked-url) term weights collected in <c>queryDict</c> into one
/// record per query and writes the result to a file.
/// </summary>
/// <remarks>
/// A query may be related to more than one clicked URL, so the entries
/// (query, clicked-url1, term weights1) ... (query, clicked-urlN, term weightsN)
/// are merged into a single (query, term weights) record.
/// Each output line is: query, TAB, summed frequency, TAB, then "term[weight]" followed by a TAB
/// for every term of the merged item.
/// </remarks>
/// <param name="strSavedFileName">Path of the output file; it is written as UTF-8 and overwritten if it exists.</param>
public static void MergeQueryWeight(string strSavedFileName)
{
    // "using" flushes and closes the writer even when merging throws, so no buffered
    // output is lost and the file handle is never leaked.
    using (StreamWriter sw = new StreamWriter(strSavedFileName, false, Encoding.UTF8))
    {
        if (queryDictIdx <= 0)
        {
            // Nothing was collected: leave an empty output file rather than reading queryDict[0].
            return;
        }

        Console.WriteLine("Sorting query dictionary...");
        queryDict.Sort(0, queryDictIdx);

        // After sorting, entries of the same query are adjacent; collect each run in qiList.
        List<QueryItem> qiList = new List<QueryItem>();
        string strQuery = queryDict[0].strQuery;
        qiList.Add(queryDict[0].qi);

        for (long k = 1; k < queryDictIdx; k++)
        {
            if (strQuery != queryDict[k].strQuery)
            {
                // The query text changed: emit the finished run and start a new one.
                WriteMergedQuery(sw, strQuery, qiList);
                qiList.Clear();
            }

            strQuery = queryDict[k].strQuery;
            qiList.Add(queryDict[k].qi);
        }

        // Emit the last run, which the loop above never flushes.
        if (qiList.Count > 0)
        {
            WriteMergedQuery(sw, strQuery, qiList);
        }
    }
}

// Merges the items gathered for one query and writes a single output line.
// Writes nothing when MergeOneQuery yields no result (null).
private static void WriteMergedQuery(StreamWriter sw, string strQuery, List<QueryItem> qiList)
{
    QueryItem rstQueryItem = MergeOneQuery(qiList);
    if (rstQueryItem == null)
    {
        return;
    }

    // Calculate the query's total clicked frequency over all merged items.
    int iTotalFreq = 0;
    foreach (QueryItem item in qiList)
    {
        iTotalFreq += item.freq;
    }

    StringBuilder line = new StringBuilder();
    line.Append(strQuery).Append('\t').Append(iTotalFreq.ToString()).Append('\t');
    for (int i = 0; i < rstQueryItem.tokenList.Count; i++)
    {
        line.Append(rstQueryItem.tokenList[i].strTerm)
            .Append('[')
            .Append(rstQueryItem.tokenList[i].fWeight.ToString("0.##"))
            .Append("]\t");
    }

    sw.WriteLine(line.ToString());
}
/// <summary>
/// Computes term weights for the queries of one cluster (the list handed in by the caller).
/// </summary>
/// <remarks>
/// Pass 1 segments every query into terms; a query containing a repeated term is dropped
/// (its <c>tokenList</c> is set to null).
/// Pass 2 takes each remaining query in turn, looks for the other queries that
/// <c>AsubOfB</c> / <c>AsuperOfB</c> report as its sub- or super-query, accumulates their
/// frequencies, and stores frequency ratios as the weights of the query's own tokens.
/// Queries whose group is smaller than <c>MIN_CLUSTER_SIZE</c> are dropped as well.
/// </remarks>
/// <param name="qiList">Queries of the cluster; the <c>tokenList</c> of each item is overwritten.</param>
/// <returns>
/// True when at least one kept query of length >= 2 is related to every query that survived pass 1.
/// </returns>
private static bool StatTermWeightInQuery(List<QueryItem> qiList)
{
    // Pass 1: tokenize each query and count the ones that stay usable.
    int usableQueries = 0;
    foreach (QueryItem query in qiList)
    {
        query.tokenList = SegmentIntoUniqueTerms(query.strQuery);
        if (query.tokenList != null)
        {
            usableQueries++;
        }
    }

    // Pass 2: weight the terms of every usable query against its related queries.
    bool coversWholeCluster = false;
    for (int current = 0; current < qiList.Count; current++)
    {
        QueryItem pivot = qiList[current];
        if (pivot.tokenList == null)
        {
            continue;
        }

        // Every term of the pivot starts with the pivot's own frequency.
        Dictionary<Token, int> termFreq = new Dictionary<Token, int>();
        foreach (Token term in pivot.tokenList)
        {
            termFreq.Add(term, pivot.freq);
        }

        int clusterFreq = pivot.freq;
        int relatedQueries = 1;

        for (int other = 0; other < qiList.Count; other++)
        {
            if (other == current || qiList[other].tokenList == null)
            {
                continue;
            }

            QueryItem candidate = qiList[other];

            // The same list is handed to both checks, sub-query first and super-query only if
            // the first check fails. The helpers fill it through the third argument; presumably
            // with the pivot's tokens shared with the candidate (they are used as dictionary keys).
            List<Token> sharedTerms = new List<Token>();
            bool related = AsubOfB(candidate.tokenList, pivot.tokenList, sharedTerms)
                || AsuperOfB(candidate.tokenList, pivot.tokenList, sharedTerms);
            if (!related)
            {
                continue;
            }

            // Grow the group and credit the candidate's frequency to the reported terms.
            relatedQueries++;
            clusterFreq += candidate.freq;
            foreach (Token term in sharedTerms)
            {
                termFreq[term] += candidate.freq;
            }
        }

        if (relatedQueries < MIN_CLUSTER_SIZE)
        {
            // The group is too small: drop this query. Later pivots will no longer see it.
            pivot.tokenList = null;
            continue;
        }

        if (relatedQueries == usableQueries && pivot.strQuery.Length >= 2)
        {
            // Every usable query is a sub- or super-query of the pivot.
            coversWholeCluster = true;
        }

        // Weight = accumulated term frequency / accumulated group frequency.
        foreach (Token term in pivot.tokenList)
        {
            term.fWeight = ((double)termFreq[term]) / ((double)clusterFreq);
        }
    }

    return coversWholeCluster;
}

// Segments a query with the shared word breaker and returns its trimmed, non-empty terms as
// tokens, or null as soon as a term occurs twice in the query.
private static List<Token> SegmentIntoUniqueTerms(string strQuery)
{
    wordseg.Segment(strQuery, tokens, false);

    List<Token> uniqueTerms = new List<Token>();
    for (int idx = 0; idx < tokens.tokenList.Count; idx++)
    {
        WordSeg.Token segmented = tokens.tokenList[idx];
        string term = segmented.strTerm.Trim();
        if (term.Length == 0)
        {
            // Empty after normalization: skip the term, keep the query.
            continue;
        }

        if (uniqueTerms.Exists(known => known.strTerm == term))
        {
            // Duplicated term: the whole query is ignored.
            return null;
        }

        uniqueTerms.Add(new Token { strTerm = term });
    }

    return uniqueTerms;
}
/// <summary>
/// Entry point: reads tab-separated (query, url, frequency) lines, groups consecutive lines
/// that share the same second field into clusters, computes term weights per cluster and
/// finally writes the merged per-query weights.
/// </summary>
/// <param name="args">
/// Six arguments: input file, output file, minimum query/url pair frequency,
/// minimum cluster size, word breaker lexical dictionary, normalize mapping file.
/// </param>
private static void Main(string[] args)
{
    if (args.Length != 6)
    {
        Console.WriteLine("StatTermWeightInQuery [input:query_clusterId_freq filename] [output:query_term_weight filename] [input:min query_clusterId frequency] [input:min cluster size] [input:word breaker lexical dictionary] [input:normalize mapping filename]");
        return;
    }

    // Validate the numeric parameters instead of crashing with an unhandled FormatException.
    int minPairFrequency;
    int minClusterSize;
    if (!int.TryParse(args[2], out minPairFrequency) || !int.TryParse(args[3], out minClusterSize))
    {
        Console.WriteLine("Invalid numeric argument: min frequency '{0}', min cluster size '{1}'", args[2], args[3]);
        return;
    }

    //Initialize parameters
    MIN_QUERY_URL_PAIR_FREQUENCY = minPairFrequency;
    MIN_CLUSTER_SIZE = minClusterSize;
    InitializeWordBreaker(args[4]);
    LoadNormalizedMappingFile(args[5]);

    Console.WriteLine("Start to process...");

    // "using" releases the input file even if processing throws, and closes it before the
    // merge phase starts.
    using (StreamReader reader = new StreamReader(args[0]))
    {
        string lastUrl = "";
        List<QueryItem> qiList = new List<QueryItem>();
        string strLine = null;
        while ((strLine = reader.ReadLine()) != null)
        {
            // NOTE(review): ToLower() is culture-sensitive; kept as-is to preserve output.
            strLine = strLine.ToLower().Trim();
            string[] strArray = strLine.Split(new char[] { '\t' });
            if (strArray.Length < 3)
            {
                Console.WriteLine("Invalidated line: {0}", strLine);
                continue;
            }

            QueryItem item = null;
            try
            {
                //Construct query item instance
                item = new QueryItem { strQuery = NormalizeQuery(strArray[0]), freq = int.Parse(strArray[2]) };
            }
            catch (Exception)
            {
                // Best effort: a line that cannot be normalized or parsed is reported and skipped.
                Console.WriteLine("Invalidated line: {0}", strLine);
                continue;
            }

            if (item.freq < MIN_QUERY_URL_PAIR_FREQUENCY)
            {
                // Too rare: the pair takes no part in any cluster.
                continue;
            }

            string strUrl = strArray[1];
            if ((lastUrl.Length > 0) && (strUrl != lastUrl))
            {
                // The second field changed: the previous cluster is complete.
                ProcessCluster(qiList);
                qiList = new List<QueryItem>();
            }

            qiList.Add(item);
            lastUrl = strUrl;
        }

        //Stat the last query-url cluster
        ProcessCluster(qiList);
    }

    Console.WriteLine("Merging clusters...");
    MergeQueryWeight(args[1]);
}

// Computes term weights for one query-url cluster of at least two queries and records the
// cluster through AddQueryList when StatTermWeightInQuery returns true.
private static void ProcessCluster(List<QueryItem> qiList)
{
    if (qiList.Count >= 2 && StatTermWeightInQuery(qiList))
    {
        AddQueryList(qiList);
    }
}