/// <summary>
/// Normalizes <paramref name="src"/> with Utility.Traditional2Simplified followed by
/// Utility.ToDBC, then segments the normalized text with the WordSegment instance
/// returned by getWordSegmentInstance(), using ms_nKind as the second argument.
/// </summary>
/// <param name="src">Raw text to segment.</param>
/// <returns>The value returned by WordSegment.Segment for the normalized text.</returns>
private static List <WordResult[]> resultSegment(string src)
        {
            WordSegment segmenter = getWordSegmentInstance();

            // Normalization and segmentation both run while holding the lock on the
            // segmenter instance, so callers using the same instance are serialized.
            lock (segmenter)
            {
                string normalized = Utility.ToDBC(Utility.Traditional2Simplified(src));

                return(segmenter.Segment(normalized, ms_nKind));
            }
        }
Пример #2
0
        /// <summary>
        /// Segments a sentence into words and returns them as a flat array.
        /// </summary>
        /// <param name="sentence">Text passed to wordSegment.Segment together with nKind.</param>
        /// <returns>
        /// The sWord of every entry in every result row, in order, excluding the first
        /// and last entry of each row.
        /// </returns>
        public string[] Segment(string sentence)
        {
            List <WordResult[]> rows  = wordSegment.Segment(sentence, nKind);
            List <string>       words = new List <string>();

            foreach (WordResult[] row in rows)
            {
                // Index 0 and index Length - 1 are skipped
                // (presumably sentence begin/end markers -- confirm against WordSegment).
                int last = row.Length - 1;

                for (int k = 1; k < last; k++)
                {
                    // A part-of-speech filter (nPOS == Utility.GetPOSValue("n")) is
                    // intentionally not applied here; every word is kept.
                    words.Add(row[k].sWord);
                }
            }
            return(words.ToArray());
        }
Пример #3
0
        /// <summary>
        /// Thread entry point. Starting at index Para.Num and stepping by 2 through
        /// fileList, segments every non-empty line of each file and writes the words,
        /// separated by spaces, one input line per output line, to
        /// outDir + Num + ".txt". The stride of 2 keeps the input files of two worker
        /// threads disjoint.
        /// </summary>
        /// <param name="order">A Para instance whose Num selects the first file index and the output file name.</param>
        private static void AnalyFuc(object order)
        {
            int         num         = ((Para)order).Num;
            WordSegment wordSegment = new WordSegment();

            wordSegment.InitWordSegment(DictPath);

            // 'using' guarantees the writer is flushed and closed even if a file cannot
            // be opened or read part-way through the run.
            using (StreamWriter sw = new StreamWriter(outDir + num + ".txt", false, System.Text.Encoding.Default))
            {
                for (int i = num; i < fileList.Count; i += 2)
                {
                    using (StreamReader sr = new StreamReader(fileList[i], System.Text.Encoding.Default))
                    {
                        string input;

                        // Reading in the loop condition means every 'continue' below
                        // advances to the next line. Previously the catch block
                        // continued without reading, retrying the same failing line
                        // forever.
                        while ((input = sr.ReadLine()) != null)
                        {
                            if (input == "")
                            {
                                continue;
                            }

                            List <WordResult[]> result;
                            try
                            {
                                result = wordSegment.Segment(input);
                            }
                            catch (Exception e)
                            {
                                // Best effort: report the failure and skip this line.
                                Console.WriteLine(e.Message);
                                continue;
                            }

                            // Nothing to write if the segmenter produced no rows.
                            if (result == null || result.Count == 0)
                            {
                                continue;
                            }

                            // Only the first result row is written; its first and last
                            // entries are skipped (presumably begin/end markers -- confirm).
                            WordResult[] row = result[0];
                            for (int j = 1; j < row.Length - 1; ++j)
                            {
                                sw.Write(row[j].sWord + " ");
                            }
                            sw.WriteLine("");
                        }
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        /// 添加新分词
        /// </summary>
        /// <param name="words"></param>
        //public void AddNewWord(List<string> words)
        //{
        //    //string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") +
        //    //             Path.DirectorySeparatorChar;

        //    WordDictionary dict = new WordDictionary();
        //    var dic = Core.Helper.IOHelper.GetMapPath("~/App_Data/Dict/");
        //    dict.Load(dic + "coreDict.dct");
        //    foreach (var word in words)
        //    {
        //        dict.AddItem(word, Utility.GetPOSValue("n"), 10);
        //    }
        //    dict.Save(dic + "coreDictNew.dct");
        //}

        /// <summary>
        /// Segments the text and returns the distinct lower-cased words in order of
        /// first appearance.
        /// </summary>
        /// <param name="str">Text to segment; it is trimmed before segmentation. A null value yields an empty list.</param>
        /// <returns>
        /// Lower-cased sWord values from every result row (excluding the first and last
        /// entry of each row), with duplicates removed.
        /// </returns>
        public List <string> DoSegment(string str)
        {
            List <string> result = new List <string>();

            // Previously a null argument failed with NullReferenceException at Trim().
            if (str == null)
            {
                return(result);
            }

            str = str.Trim();
            var r = wordSegment.Segment(str);

            // Tracks words already emitted so de-duplication is O(1) per word instead of
            // scanning the result list (which the original did twice per word).
            HashSet <string> seen = new HashSet <string>();

            for (int i = 0; i < r.Count; i++)
            {
                // Index 0 and Length - 1 are skipped
                // (presumably begin/end markers -- confirm against WordSegment).
                for (int j = 1; j < r[i].Length - 1; j++)
                {
                    string word = r[i][j].sWord.ToLower();

                    // Add returns false when the word was already seen.
                    if (seen.Add(word))
                    {
                        result.Add(word);
                    }
                }
            }
            return(result);
        }
Пример #5
0
        /// <summary>
        /// Segments the query text into words, looks each word up in the [Index] table
        /// and combines the per-word posting lists and tf-idf scores into a ranked result.
        /// </summary>
        /// <param name="str">Query text. Line breaks and several punctuation characters are padded with spaces before segmentation.</param>
        /// <returns>
        /// A SearchResult whose words holds the segmented query words and whose
        /// documentID / tfidf arrays are ordered by descending tfidf. Both arrays are
        /// empty when a query word is missing from the index or no postings were read.
        /// </returns>
        public SearchResult Search(string str)
        {
            Dictionary <string, int> ID;
            int N;

            string[] words;
            int[]    results = null;
            double[] tfidf   = null;

            // 'using' releases the connection, commands and readers even when a query
            // or the segmenter throws; previously they were only closed on success.
            using (SqlConnection conn = new SqlConnection(connstr))
            {
                conn.Open();

                if (lastVisit == Functions.end)
                {
                    // Cached word->id map and document count are still current
                    // (presumably Functions.end changes when the index is rebuilt -- confirm).
                    ID = cacheID;
                    N  = cacheN;
                }
                else
                {
                    using (SqlCommand countCmd = new SqlCommand("SELECT COUNT(*) FROM [data]", conn))
                    {
                        N = (int)countCmd.ExecuteScalar();
                    }

                    ID = new Dictionary <string, int>();
                    using (SqlCommand indexCmd = new SqlCommand("SELECT [id],[word] FROM [Index]", conn))
                    using (SqlDataReader indexReader = indexCmd.ExecuteReader())
                    {
                        while (indexReader.Read())
                        {
                            ID.Add(((string)indexReader["word"]).Trim(), (int)indexReader["id"]);
                        }
                    }

                    // The cache is only published after the whole map has loaded.
                    cacheID   = ID;
                    cacheN    = N;
                    lastVisit = Functions.end;
                }

                //str = PreProcessUtility.ToSimplifyString(str);
                // Pad line breaks and punctuation with spaces before segmentation.
                str = str.Replace("\r", " ").Replace("\n", " ").Replace("/", " / ").Replace(":", " : ").Replace("<", " < ").Replace(">", " > ").Replace("\"", " \" ");
                str = str.Replace("&", " & ").Replace("(", " ( ").Replace(")", " ) ").Replace("-", " - ").Replace("'", " ' ");

                words = WordResultToString(wordSegment.Segment(str, 1));

                for (int i = 0; i < words.Length; i++)
                {
                    int wordId;

                    // Single lookup instead of ContainsKey followed by the indexer.
                    if (!ID.TryGetValue(words[i].ToLower(), out wordId))
                    {
                        // A query word that is not indexed makes the whole result empty.
                        results = null;
                        break;
                    }

                    using (SqlCommand cmd = new SqlCommand("SELECT [df], [tf], [posts] FROM [Index] WHERE [ID]=@id", conn))
                    {
                        cmd.Parameters.Add("id", SqlDbType.Int).Value = wordId;

                        using (SqlDataReader dr = cmd.ExecuteReader())
                        {
                            if (dr.Read())
                            {
                                int[] tf    = BytesToInts((byte[])dr["tf"]);
                                int[] posts = BytesToInts((byte[])dr["posts"]);
                                int   df    = (int)dr["df"];

                                double[] tfidf2 = getTfIdf(tf, df, N);

                                if (results == null)
                                {
                                    results = posts;
                                    tfidf   = tfidf2;
                                }
                                else
                                {
                                    // Merge this word's postings and scores into the running result.
                                    results = CombineLists(results, posts, tfidf, tfidf2, out tfidf);
                                }
                            }
                        }
                    }
                }
            }

            SearchResult ret = new SearchResult();

            if (results == null)
            {
                ret.documentID = new int[0];
                ret.tfidf      = new double[0];
            }
            else
            {
                // Sort ascending by score keeping document ids aligned, then reverse
                // both arrays to obtain descending order.
                Array.Sort(tfidf, results);
                Array.Reverse(tfidf);
                Array.Reverse(results);
                ret.documentID = results;
                ret.tfidf      = tfidf;
            }
            ret.words = words;

            return(ret);
        }
Пример #6
0
 /// <summary>
 /// Starts word segmentation: forwards the sentence to wordSegment.Segment
 /// together with nKind.
 /// </summary>
 /// <param name="sentence">Text to segment.</param>
 /// <returns>The result rows returned by wordSegment.Segment.</returns>
 public List <WordResult[]> Segment(string sentence)
 {
     List <WordResult[]> segmented = wordSegment.Segment(sentence, nKind);

     return(segmented);
 }