Example #1
        static void Main(string[] args)
        {
            if (args.Length < 3)
            {
                Console.WriteLine("lm_score.exe [LexDict file] [language model file] [ngram-order] <input file> <output file>");
                Console.WriteLine(" if <input file> and <output file> is empty, the input/output will be re-directed to console.");
                return;
            }

            WordSeg.WordSeg wordseg = new WordSeg.WordSeg();
            WordSeg.Tokens  tokens  = null;

            //Load lexical dictionary for word breaking
            wordseg.LoadLexicalDict(args[0], true);
            tokens = wordseg.CreateTokens();

            //Load language model
            LMDecoder.KNDecoder lmDecoder = new LMDecoder.KNDecoder();
            lmDecoder.LoadLM(args[1]);

            StreamReader sr = null;

            if (args.Length >= 4)
            {
                sr = new StreamReader(args[3]);
            }

            StreamWriter sw = null;

            if (args.Length >= 5)
            {
                sw = new StreamWriter(args[4]);
            }

            Console.WriteLine("Ready...");
            if (sw == null)
            {
                Console.WriteLine("Text\tProbability\tOOV\tPerplexity");
            }
            else
            {
                sw.WriteLine("Text\tProbability\tOOV\tPerplexity");
            }

            int order = int.Parse(args[2]);

            while (true)
            {
                string strLine = null;

                if (sr == null)
                {
                    strLine = Console.ReadLine();
                }
                else
                {
                    strLine = sr.ReadLine();
                }

                //Exit on end of input or an empty line
                if (strLine == null || strLine.Length == 0)
                {
                    break;
                }

                //Only use the first column
                string[] items   = strLine.Split('\t');
                string   strText = items[0];

                //Segment text by lexical dictionary
                wordseg.Segment(strText, tokens, false);
                StringBuilder sb = new StringBuilder();
                //Parse each broken token
                for (int i = 0; i < tokens.tokenList.Count; i++)
                {
                    string strTerm = tokens.tokenList[i].strTerm.Trim();
                    if (strTerm.Length > 0)
                    {
                        sb.Append(strTerm);
                        sb.Append(" ");
                    }
                }
                strText = sb.ToString().Trim();

                //Score the space-joined tokens with the n-gram language model
                LMResult LMRst = lmDecoder.GetSentProb(strText, order);

                if (sw == null)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", strText, LMRst.logProb, LMRst.oovs, LMRst.perplexity);
                }
                else
                {
                    sw.WriteLine("{0}\t{1}\t{2}\t{3}", strText, LMRst.logProb, LMRst.oovs, LMRst.perplexity);
                }
            }

            if (sr != null)
            {
                sr.Close();
            }
            if (sw != null)
            {
                sw.Close();
            }
        }
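
The same scoring pipeline can be reused outside of Main. The sketch below wraps it in a small helper class built only from the APIs the example already exercises (LoadLexicalDict, CreateTokens, Segment, LoadLM, GetSentProb); the SentenceScorer name and constructor shape are hypothetical, not part of the original tool, and it assumes the same usings as the example (e.g. System.Text for StringBuilder).

        //A minimal sketch, reusing only the APIs shown in Example #1.
        //SentenceScorer is a hypothetical wrapper, not part of the tool.
        class SentenceScorer
        {
            private WordSeg.WordSeg wordseg;
            private WordSeg.Tokens tokens;
            private LMDecoder.KNDecoder lmDecoder;
            private int order;

            public SentenceScorer(string lexDictFile, string lmFile, int ngramOrder)
            {
                //Load lexical dictionary for word breaking
                wordseg = new WordSeg.WordSeg();
                wordseg.LoadLexicalDict(lexDictFile, true);
                tokens = wordseg.CreateTokens();

                //Load language model
                lmDecoder = new LMDecoder.KNDecoder();
                lmDecoder.LoadLM(lmFile);

                order = ngramOrder;
            }

            //Segment the raw text, join the tokens with spaces and score them
            public LMResult ScoreSentence(string strText)
            {
                wordseg.Segment(strText, tokens, false);

                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < tokens.tokenList.Count; i++)
                {
                    string strTerm = tokens.tokenList[i].strTerm.Trim();
                    if (strTerm.Length > 0)
                    {
                        sb.Append(strTerm);
                        sb.Append(' ');
                    }
                }

                return lmDecoder.GetSentProb(sb.ToString().Trim(), order);
            }
        }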
Example #2
        static void Main(string[] args)
        {
            if (args.Length != 6)
            {
                Console.WriteLine("BuildQueryTermWeightCorpus.exe [Min frequency in query] [Query Segment Labels] [Min Segment Gap] [Lexical dictionary file name] [Query term weight score file name] [Training corpus file name]");
                return;
            }

            int minFreq = int.Parse(args[0]);

            //Each label is one term-importance level; N labels need N-1
            //threshold gaps to split the weight range into N levels
            string[] labItems = args[1].Split(',');

            MAX_THRESHOLD_NUM    = labItems.Length - 1;
            MIN_WEIGHT_SCORE_GAP = double.Parse(args[2]);

            wordseg = new WordSeg.WordSeg();
            //Load lexical dictionary
            wordseg.LoadLexicalDict(args[3], true);
            //Initialize word breaker's token instance
            wbTokens = wordseg.CreateTokens();

            if (File.Exists(args[4]) == false)
            {
                Console.WriteLine("Query term weight file {0} is not existed.", args[4]);
                return;
            }

            StreamReader sr = new StreamReader(args[4]);
            StreamWriter sw = new StreamWriter(args[5]);

            while (sr.EndOfStream == false)
            {
                string   strLine     = sr.ReadLine();
                string[] items       = strLine.Split('\t');
                string   strRawQuery = items[0];
                int      queryFreq   = int.Parse(items[1]);

                //Ignore low-frequency queries
                if (queryFreq < minFreq)
                {
                    continue;
                }

                try
                {
                    //Get query features: map each weight score to the total
                    //character length of the terms that carry that score
                    SortedDictionary <double, int> sdict = new SortedDictionary <double, int>();
                    double maxWeight = -1.0;
                    for (int i = 2; i < items.Length; i++)
                    {
                        if (items[i].Trim().Length == 0)
                        {
                            //Ignore empty term
                            continue;
                        }

                        //Parse item and get term string and its weight score
                        int    pos     = items[i].IndexOf('[');
                        string strTerm = items[i].Substring(0, pos).Trim().ToLower();
                        if (strTerm.Length == 0)
                        {
                            continue;
                        }

                        string strWeight = items[i].Substring(pos + 1, items[i].Length - (pos + 1) - 1);
                        double fWeight   = double.Parse(strWeight);
                        if (sdict.ContainsKey(fWeight) == false)
                        {
                            sdict.Add(fWeight, 0);
                        }
                        sdict[fWeight] += strTerm.Length;

                        //Track the maximum weight score in the current query
                        if (fWeight >= maxWeight)
                        {
                            maxWeight = fWeight;
                        }
                    }

                    if (maxWeight < 1.0)
                    {
                        continue;
                    }

                    //Walk weight scores in descending order (SortedDictionary
                    //iterates ascending; Reverse() requires System.Linq)
                    List <ScoreItem> scoreList = new List <ScoreItem>();
                    foreach (KeyValuePair <double, int> pair in sdict.Reverse())
                    {
                        ScoreItem scoreItem = new ScoreItem();
                        scoreItem.score      = pair.Key;
                        scoreItem.bThreshold = false;
                        scoreItem.gap        = 0.0;

                        scoreList.Add(scoreItem);
                    }

                    //Find topN max threshold gaps
                    List <double> thresholdGapList = CalcTopNMaxThresholdGap(scoreList, MAX_THRESHOLD_NUM, MIN_WEIGHT_SCORE_GAP);
                    if (thresholdGapList.Count != MAX_THRESHOLD_NUM)
                    {
                        //If the number of threshold gaps differs from the number specified on the command line, ignore this query
                        continue;
                    }

                    if (thresholdGapList.Count > 0)
                    {
                        int coreCnt  = 0;
                        int otherCnt = 0;
                        //Count the number of core terms
                        foreach (KeyValuePair <double, int> pair in sdict)
                        {
                            if (pair.Key > thresholdGapList[0])
                            {
                                coreCnt += pair.Value;
                            }
                            else
                            {
                                otherCnt += pair.Value;
                            }
                        }
                        if (coreCnt < 2)
                        {
                            continue;
                        }
                    }

                    //Re-parse each item and assign an importance label to each term
                    List <Token> tkList = new List <Token>();
                    for (int i = 2; i < items.Length; i++)
                    {
                        if (items[i].Trim().Length == 0)
                        {
                            continue;
                        }

                        //Parse training corpus
                        int    pos     = items[i].IndexOf('[');
                        string strTerm = items[i].Substring(0, pos).Trim().ToLower();
                        if (strTerm.Length == 0)
                        {
                            continue;
                        }

                        string strWeight = items[i].Substring(pos + 1, items[i].Length - (pos + 1) - 1);
                        double fWeight   = double.Parse(strWeight);

                        Token tk = new Token();
                        tk.fWeight = fWeight;
                        tk.strTerm = strTerm;

                        bool bProcessed = false;
                        //Assign a label according to the term's weight and the threshold gaps
                        for (int j = 0; j < thresholdGapList.Count; j++)
                        {
                            if (fWeight > thresholdGapList[j])
                            {
                                tk.strTag  = labItems[j];
                                bProcessed = true;
                                break;
                            }
                        }

                        //Terms below all thresholds get the last label
                        if (bProcessed == false)
                        {
                            int j = thresholdGapList.Count;
                            tk.strTag = labItems[j];
                        }

                        tkList.Add(tk);
                    }

                    //Merge contiguous terms with the same tag
                    tkList = MergeTokenList(tkList);
                    //Re-segment terms by given lexical dictionary
                    tkList = ResegmentTokenList(tkList);
                    string strOutput = "";

                    //Generate the term importance level corpus
                    foreach (Token tk in tkList)
                    {
                        string strTag = tk.strTag;
                        strOutput += tk.strTerm + "[" + strTag + "] ";
                    }
                    sw.WriteLine("{0}\t{1}\t{2}", strRawQuery, queryFreq, strOutput.Trim());
                }
                catch (Exception)
                {
                    Console.WriteLine("Invalidated sentence: {0}", strLine);
                }
            }

            sr.Close();
            sw.Close();
        }
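
Both passes over items in the loop above parse the same bracketed term[weight] format with IndexOf and Substring. The helper below centralizes that parsing and makes explicit the bounds checks the inline version leaves to the surrounding try/catch; TryParseTermWeight is a hypothetical name, while the format itself is taken directly from the code above.

        //A minimal sketch of parsing one "term[weight]" item.
        //TryParseTermWeight is a hypothetical helper, not part of the tool.
        static bool TryParseTermWeight(string item, out string strTerm, out double fWeight)
        {
            strTerm = null;
            fWeight = 0.0;

            int pos = item.IndexOf('[');
            if (pos < 0 || item.EndsWith("]") == false)
            {
                return false;
            }

            strTerm = item.Substring(0, pos).Trim().ToLower();
            if (strTerm.Length == 0)
            {
                return false;
            }

            //The weight score sits between '[' and the trailing ']'
            string strWeight = item.Substring(pos + 1, item.Length - (pos + 1) - 1);
            return double.TryParse(strWeight, out fWeight);
        }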