static void Main(string[] args)
{
    if (args.Length < 3)
    {
        Console.WriteLine("lm_score.exe [LexDict file] [language model file] [ngram-order] <input file> <output file>");
        Console.WriteLine("  If <input file> and <output file> are omitted, input/output is redirected to the console.");
        return;
    }

    WordSeg.WordSeg wordseg = new WordSeg.WordSeg();
    WordSeg.Tokens tokens = null;

    //Load lexical dictionary for word breaking
    wordseg.LoadLexicalDict(args[0], true);
    tokens = wordseg.CreateTokens();

    //Load language model
    LMDecoder.KNDecoder lmDecoder = new LMDecoder.KNDecoder();
    lmDecoder.LoadLM(args[1]);

    StreamReader sr = null;
    if (args.Length >= 4)
    {
        sr = new StreamReader(args[3]);
    }

    StreamWriter sw = null;
    if (args.Length >= 5)
    {
        sw = new StreamWriter(args[4]);
    }

    Console.WriteLine("Ready...");
    if (sw == null)
    {
        Console.WriteLine("Text\tProbability\tOOV\tPerplexity");
    }
    else
    {
        sw.WriteLine("Text\tProbability\tOOV\tPerplexity");
    }

    int order = int.Parse(args[2]);
    while (true)
    {
        string strLine = null;
        if (sr == null)
        {
            strLine = Console.ReadLine();
        }
        else
        {
            strLine = sr.ReadLine();
        }

        //Empty line, exit
        if (strLine == null || strLine.Length == 0)
        {
            break;
        }

        //Only use the first column
        string[] items = strLine.Split('\t');
        string strText = items[0];

        //Segment text by lexical dictionary
        wordseg.Segment(strText, tokens, false);

        //Rebuild the text as space-separated segmented tokens
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tokens.tokenList.Count; i++)
        {
            string strTerm = tokens.tokenList[i].strTerm.Trim();
            if (strTerm.Length > 0)
            {
                sb.Append(strTerm);
                sb.Append(" ");
            }
        }
        strText = sb.ToString().Trim();

        //Score the segmented sentence with the n-gram language model
        LMResult LMRst = lmDecoder.GetSentProb(strText, order);
        if (sw == null)
        {
            Console.WriteLine("{0}\t{1}\t{2}\t{3}", strText, LMRst.logProb, LMRst.oovs, LMRst.perplexity);
        }
        else
        {
            sw.WriteLine("{0}\t{1}\t{2}\t{3}", strText, LMRst.logProb, LMRst.oovs, LMRst.perplexity);
        }
    }

    if (sr != null)
    {
        sr.Close();
    }
    if (sw != null)
    {
        sw.Close();
    }
}
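//Note: the scorer above only reads three fields from the object returned by
//lmDecoder.GetSentProb(). A minimal sketch of that result container, inferred from
//those reads, is given below for reference only; the real LMResult type is defined
//in the LMDecoder library, and its member types and exact semantics may differ.
public class LMResult
{
    public double logProb;    //log probability of the whole segmented sentence
    public int oovs;          //count of out-of-vocabulary terms (type assumed here)
    public double perplexity; //perplexity of the sentence under the n-gram model
}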
static void Main(string[] args)
{
    if (args.Length != 6)
    {
        Console.WriteLine("BuildQueryTermWeightCorpus.exe [Min frequency in query] [Query Segment Labels] [Min Segment Gap] [Lexical dictionary file name] [Query term weight score file name] [Training corpus file name]");
        return;
    }

    int minFreq = int.Parse(args[0]);
    string[] labItems = args[1].Split(',');
    MAX_THRESHOLD_NUM = labItems.Length - 1;
    MIN_WEIGHT_SCORE_GAP = double.Parse(args[2]);

    wordseg = new WordSeg.WordSeg();
    //Load lexical dictionary
    wordseg.LoadLexicalDict(args[3], true);
    //Initialize word breaker's token instance
    wbTokens = wordseg.CreateTokens();

    if (File.Exists(args[4]) == false)
    {
        Console.WriteLine("Query term weight file {0} does not exist.", args[4]);
        return;
    }

    StreamReader sr = new StreamReader(args[4]);
    StreamWriter sw = new StreamWriter(args[5]);
    while (sr.EndOfStream == false)
    {
        string strLine = sr.ReadLine();
        string[] items = strLine.Split('\t');
        string strRawQuery = items[0];
        int queryFreq = int.Parse(items[1]);

        //Ignore queries whose frequency is below the threshold
        if (queryFreq < minFreq)
        {
            continue;
        }

        try
        {
            //Aggregate term lengths by weight score for the query
            SortedDictionary<double, int> sdict = new SortedDictionary<double, int>();
            double maxWeight = -1.0;
            for (int i = 2; i < items.Length; i++)
            {
                if (items[i].Trim().Length == 0)
                {
                    //Ignore empty term
                    continue;
                }

                //Parse the "term[weight]" item and get the term string and its weight score
                int pos = items[i].IndexOf('[');
                string strTerm = items[i].Substring(0, pos).Trim().ToLower();
                if (strTerm.Length == 0)
                {
                    continue;
                }
                string strWeight = items[i].Substring(pos + 1, items[i].Length - (pos + 1) - 1);
                double fWeight = double.Parse(strWeight);

                if (sdict.ContainsKey(fWeight) == false)
                {
                    sdict.Add(fWeight, 0);
                }
                sdict[fWeight] += strTerm.Length;

                //Track the maximum weight score in the current query
                if (fWeight >= maxWeight)
                {
                    maxWeight = fWeight;
                }
            }

            if (maxWeight < 1.0)
            {
                continue;
            }

            //Build the weight score list in descending order
            List<ScoreItem> scoreList = new List<ScoreItem>();
            foreach (KeyValuePair<double, int> pair in sdict.Reverse())
            {
                ScoreItem scoreItem = new ScoreItem();
                scoreItem.score = pair.Key;
                scoreItem.bThreshold = false;
                scoreItem.gap = 0.0;
                scoreList.Add(scoreItem);
            }

            //Find the top-N maximum threshold gaps
            List<double> thresholdGapList = CalcTopNMaxThresholdGap(scoreList, MAX_THRESHOLD_NUM, MIN_WEIGHT_SCORE_GAP);
            if (thresholdGapList.Count != MAX_THRESHOLD_NUM)
            {
                //If the number of threshold gaps differs from the number specified on the command line, ignore this query
                continue;
            }

            if (thresholdGapList.Count > 0)
            {
                int coreCnt = 0;
                int otherCnt = 0;
                //Count the number of core terms
                foreach (KeyValuePair<double, int> pair in sdict)
                {
                    if (pair.Key > thresholdGapList[0])
                    {
                        coreCnt += pair.Value;
                    }
                    else
                    {
                        otherCnt += pair.Value;
                    }
                }
                if (coreCnt < 2)
                {
                    continue;
                }
            }

            List<Token> tkList = new List<Token>();
            for (int i = 2; i < items.Length; i++)
            {
                if (items[i].Trim().Length == 0)
                {
                    continue;
                }

                //Parse the training corpus item as "term[weight]"
                int pos = items[i].IndexOf('[');
                string strTerm = items[i].Substring(0, pos).Trim().ToLower();
                if (strTerm.Length == 0)
                {
                    continue;
                }
                string strWeight = items[i].Substring(pos + 1, items[i].Length - (pos + 1) - 1);
                double fWeight = double.Parse(strWeight);

                Token tk = new Token();
                tk.fWeight = fWeight;
                tk.strTerm = strTerm;

                bool bProcessed = false;
                //Assign the label according to the term's weight and the thresholds
                for (int j = 0; j < thresholdGapList.Count; j++)
                {
                    if (fWeight > thresholdGapList[j])
                    {
                        tk.strTag = labItems[j];
                        bProcessed = true;
                        break;
                    }
                }
                //Label the last part
                if (bProcessed == false)
                {
                    int j = thresholdGapList.Count;
                    tk.strTag = labItems[j];
                }
                tkList.Add(tk);
            }

            //Merge contiguous terms with the same tag
            tkList = MergeTokenList(tkList);
            //Re-segment terms by the given lexical dictionary
            tkList = ResegmentTokenList(tkList);

            //Generate the term importance level corpus
            string strOutput = "";
            foreach (Token tk in tkList)
            {
                string strTag = tk.strTag;
                strOutput += tk.strTerm + "[" + strTag + "] ";
            }
            sw.WriteLine("{0}\t{1}\t{2}", strRawQuery, queryFreq, strOutput.Trim());
        }
        catch (Exception)
        {
            Console.WriteLine("Invalid sentence: {0}", strLine);
        }
    }

    sr.Close();
    sw.Close();
}
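//Note: the corpus builder above calls several helpers defined elsewhere in the project
//(Token, ScoreItem, CalcTopNMaxThresholdGap, MergeTokenList, ResegmentTokenList).
//A minimal sketch of MergeTokenList is shown below, assuming it only concatenates
//adjacent tokens that carry the same tag; the shipped implementation may join the
//term strings differently (e.g. with a separator) or adjust the merged weight.
static List<Token> MergeTokenList(List<Token> tkList)
{
    List<Token> mergedList = new List<Token>();
    foreach (Token tk in tkList)
    {
        if (mergedList.Count > 0 && mergedList[mergedList.Count - 1].strTag == tk.strTag)
        {
            //Same tag as the previous token: append the term text to it
            mergedList[mergedList.Count - 1].strTerm += tk.strTerm;
        }
        else
        {
            //Tag changed: start a new merged token
            Token newTk = new Token();
            newTk.strTerm = tk.strTerm;
            newTk.strTag = tk.strTag;
            newTk.fWeight = tk.fWeight;
            mergedList.Add(newTk);
        }
    }
    return mergedList;
}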