Example #1
 private void InitializeWordSeg(string strLexicalFileName)
 {
     wordseg = new WordSeg.WordSeg();
     //Load lexical dictionary
     wordseg.LoadLexicalDict(strLexicalFileName, true);
     //Initialize word breaker's token instance
     wbTokens = wordseg.CreateTokens();
 }
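
The wordseg and wbTokens fields set up above can then drive segmentation. Below is a minimal sketch of such a helper, assuming the Segment/tokenList API used in Example #3 and that System.Text is imported; the method name SegmentText is illustrative and not part of the original snippet:

 private string SegmentText(string strInput)
 {
     //Break the input text with the preloaded lexical dictionary
     wordseg.Segment(strInput, wbTokens, false);

     //Rejoin the non-empty terms with single spaces
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < wbTokens.tokenList.Count; i++)
     {
         string strTerm = wbTokens.tokenList[i].strTerm.Trim();
         if (strTerm.Length > 0)
         {
             sb.Append(strTerm);
             sb.Append(" ");
         }
     }
     return sb.ToString().Trim();
 }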
Example #2
        static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                Console.WriteLine("GenerateTermSynPattern.exe [input:word breaker lexical file] [input:query_clusterId_freq_clickweight file] [output:term_syn_pattern file]");
                return;
            }

            wordseg = new WordSeg.WordSeg();
            wordseg.LoadLexicalDict(args[0], true);
            //Create tokens, which is a thread-local structure
            //The max word segment length is MAX_SEGMENT_LENGTH
            tokens = wordseg.CreateTokens();


            StreamReader     sr         = new StreamReader(args[1]);
            List <QueryItem> qiList     = new List <QueryItem>();
            string           strLastUrl = "";
            long             recordCnt  = 0;

            while (sr.EndOfStream == false)
            {
                string strLine = sr.ReadLine();
                recordCnt++;

                string[]  items        = strLine.Split('\t');
                string    strClusterId = items[1];
                QueryItem qi           = new QueryItem();

                qi.strQuery = items[0];
                // qi.strQuery = JPNUtils.ToHalfKana(qi.strQuery);
                qi.strQuery    = JPNUtils.ToDBC(qi.strQuery);
                qi.strQuery    = qi.strQuery.Replace(" ", "");
                qi.strQuery    = qi.strQuery.ToLower();
                qi.freq        = int.Parse(items[2]);
                qi.clickweight = double.Parse(items[3]);

                ////Query url whose frequency or clickweight is too low will be ignored.
                //if (qi.clickweight < 5.0)
                //{
                //    continue;
                //}

                if (strLastUrl == "")
                {
                    qiList.Add(qi);
                }
                else
                {
                    if (strLastUrl == strClusterId)
                    {
                        qiList.Add(qi);
                    }
                    else
                    {
                        //If too many unique queries click the same url,
                        //the url may be low quality, since it has too many
                        //different meanings, so ignore this cluster
                        if (qiList.Count < 100)
                        {
                            StatPatternList(qiList);
                        }
                        qiList.Clear();
                        qiList.Add(qi);
                    }
                }
                strLastUrl = strClusterId;

                if (recordCnt % 10000000 == 0)
                {
                    Console.WriteLine("Process {0} records...", recordCnt);
                    UpdateSaveTermSyn(args[2]);
                }
            }

            //Flush the final cluster that is still buffered when the input ends
            if (qiList.Count > 0 && qiList.Count < 100)
            {
                StatPatternList(qiList);
            }
            UpdateSaveTermSyn(args[2]);

            sr.Close();
        }
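
For reference, the loop above reads args[1] as four tab-separated fields per line (query, clusterId, frequency, clickweight), with rows of the same cluster adjacent so the strLastUrl comparison can group them. Hypothetical input lines (<TAB> stands for a literal tab character; the values are made up):

            ramen tokyo<TAB>cluster_000123<TAB>42<TAB>37.5
            tokyo ramen shop<TAB>cluster_000123<TAB>17<TAB>12.0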
Example #3
        /// <summary>
        /// Tokenize the given corpus. The output corpus format is "word1 word2 ... wordN \t frequency"
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            if (args.Length < 3 || args.Length > 4)
            {
                Console.WriteLine("tokenization.exe [lexical dictionary] [input file] [output file] <min-frequency>");
                Console.WriteLine("  min-frequency: default value is 1");
                return;
            }

            int minFreq = 1;

            if (args.Length == 4)
            {
                minFreq = int.Parse(args[3]);
            }

            WordSeg.WordSeg wordseg = new WordSeg.WordSeg();
            WordSeg.Tokens  tokens  = null;

            //Load lexical dictionary in raw text format
            if (File.Exists(args[0]) == false)
            {
                Console.WriteLine("lexical dictionary isn't existed.");
                return;
            }
            wordseg.LoadLexicalDict(args[0], true);
            tokens = wordseg.CreateTokens();

            if (File.Exists(args[1]) == false)
            {
                Console.WriteLine("{0} isn't existed.", args[1]);
                return;
            }

            StreamReader sr      = new StreamReader(args[1], Encoding.UTF8);
            StreamWriter sw      = new StreamWriter(args[2], false, Encoding.UTF8);
            string       strLine = null;
            long         lineCnt = 0;

            while ((strLine = sr.ReadLine()) != null)
            {
                string[] items = strLine.Split('\t');
                long     freq  = 1;
                if (items.Length == 2)
                {
                    //Normalize frequency for smoothing when building LM
                    freq = long.Parse(items[1]) - (minFreq - 1);

                    if (freq <= 0)
                    {
                        continue;
                    }
                }

                lineCnt++;
                if (lineCnt % 100000 == 0)
                {
                    Console.Write("{0}...", lineCnt);
                }

                //Simple text normalization
                string strQuery = items[0].ToLower().Trim();

                //Segment text by lexical dictionary
                wordseg.Segment(strQuery, tokens, false);
                StringBuilder sb = new StringBuilder();
                //Parse each broken token
                for (int i = 0; i < tokens.tokenList.Count; i++)
                {
                    string strTerm = tokens.tokenList[i].strTerm.Trim();
                    if (strTerm.Length > 0)
                    {
                        sb.Append(strTerm);
                        sb.Append(" ");
                    }
                }

                //Output the segmented tokens together with the adjusted frequency
                sw.WriteLine("{0}\t{1}", sb.ToString().Trim(), freq);
            }

            sr.Close();
            sw.Close();
        }
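
Assuming the behavior above, a hypothetical run with min-frequency 2 (file names and values are made up, and the actual segmentation depends on the lexical dictionary) would look like:

            tokenization.exe lexicon.txt corpus.txt corpus.tok.txt 2

            input line :  newyorktimes<TAB>10
            output line:  new york times<TAB>9

The frequency column is shifted down by minFreq - 1 before writing, and lines whose adjusted frequency drops to zero or below are skipped.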