/// <summary>
/// Normalizes <paramref name="src"/> and segments it with the shared word segmenter.
/// </summary>
/// <param name="src">Raw text to segment.</param>
/// <returns>
/// Whatever <c>WordSegment.Segment</c> returns for the normalized text and <c>ms_nKind</c>.
/// </returns>
private static List<WordResult[]> resultSegment(string src)
{
    WordSegment segmenter = getWordSegmentInstance();

    // The segmenter's monitor is held across both normalization and segmentation.
    lock (segmenter)
    {
        // The text goes through Utility.Traditional2Simplified first, then Utility.ToDBC.
        string normalized = Utility.ToDBC(Utility.Traditional2Simplified(src));
        return segmenter.Segment(normalized, ms_nKind);
    }
}
/// <summary>
/// Segments a sentence and flattens the results into an array of words.
/// </summary>
/// <param name="sentence">Text to segment.</param>
/// <returns>
/// The <c>sWord</c> of every entry in every result array, in order, excluding the
/// first and last entry of each array.
/// </returns>
public string[] Segment(string sentence)
{
    List<string> words = new List<string>();

    foreach (WordResult[] candidate in wordSegment.Segment(sentence, nKind))
    {
        // Entries 0 and Length - 1 are skipped
        // (presumably sentence begin/end markers; confirm against WordSegment).
        int last = candidate.Length - 1;
        for (int k = 1; k < last; k++)
        {
            // A noun-only filter (nPOS == Utility.GetPOSValue("n")) was sketched here
            // but is disabled; every word is kept.
            words.Add(candidate[k].sWord);
        }
    }

    return words.ToArray();
}
/// <summary>
/// Thread entry point. Starting at the worker's own number and stepping by 2, it
/// segments every file in <c>fileList</c> at those indices, so two workers numbered
/// 0 and 1 never process the same input file. All output of one worker goes to
/// <c>outDir + num + ".txt"</c>.
/// </summary>
/// <param name="order">A <c>Para</c> instance whose <c>Num</c> identifies this worker.</param>
private static void AnalyFuc(object order)
{
    int num = ((Para)order).Num;

    WordSegment wordSegment = new WordSegment();
    wordSegment.InitWordSegment(DictPath);

    // 'using' guarantees the writer and each reader are closed even if an exception escapes.
    using (StreamWriter sw = new StreamWriter(outDir + num + ".txt", false, System.Text.Encoding.Default))
    {
        for (int i = num; i < fileList.Count; i += 2)
        {
            using (StreamReader sr = new StreamReader(fileList[i], System.Text.Encoding.Default))
            {
                string input;

                // Reading in the loop condition means every 'continue' below advances
                // to the next line. Previously a segmentation failure re-tried the same
                // line forever because the catch block continued without reading.
                while ((input = sr.ReadLine()) != null)
                {
                    if (input == "")
                    {
                        continue;
                    }

                    List<WordResult[]> result;
                    try
                    {
                        result = wordSegment.Segment(input);
                    }
                    catch (Exception e)
                    {
                        // Best effort: report the failure and skip this line.
                        Console.WriteLine(e.Message);
                        continue;
                    }

                    // Only the first result array is written; its first and last
                    // entries are skipped.
                    for (int j = 1; j < result[0].Length - 1; ++j)
                    {
                        sw.Write(result[0][j].sWord + " ");
                    }

                    sw.WriteLine("");
                }
            }
        }
    }
}
/// <summary>
/// Add new words to the dictionary (disabled).
/// </summary>
/// <param name="words"></param>
//public void AddNewWord(List<string> words)
//{
//    //string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") +
//    //                  Path.DirectorySeparatorChar;
//    WordDictionary dict = new WordDictionary();
//    var dic = Core.Helper.IOHelper.GetMapPath("~/App_Data/Dict/");
//    dict.Load(dic + "coreDict.dct");
//    foreach (var word in words)
//    {
//        dict.AddItem(word, Utility.GetPOSValue("n"), 10);
//    }
//    dict.Save(dic + "coreDictNew.dct");
//}

/// <summary>
/// Segments text into distinct lower-cased words.
/// </summary>
/// <param name="str">Text to segment; it is trimmed before segmentation.</param>
/// <returns>
/// The lower-cased <c>sWord</c> of each segmented entry, without duplicates, in order
/// of first occurrence. The first and last entry of every result array are skipped.
/// </returns>
public List<string> DoSegment(string str)
{
    str = str.Trim();

    List<string> result = new List<string>();
    // Tracks words already emitted so the duplicate check is a hash lookup
    // instead of a scan of the result list for every word.
    HashSet<string> seen = new HashSet<string>();

    var r = wordSegment.Segment(str);
    for (int i = 0; i < r.Count; i++)
    {
        for (int j = 1; j < r[i].Length - 1; j++)
        {
            // Lower-case once per word rather than once per comparison.
            string word = r[i][j].sWord.ToLower();
            if (seen.Add(word))
            {
                result.Add(word);
            }
        }
    }

    return result;
}
/// <summary>
/// Segments the query into words, looks each word up in the <c>[Index]</c> table and
/// combines the per-word posting lists into one result ordered by descending tf-idf.
/// </summary>
/// <param name="str">Query text.</param>
/// <returns>
/// A <c>SearchResult</c> whose <c>words</c> holds the segmented query words. If any
/// word is missing from the index, or no postings were read, <c>documentID</c> and
/// <c>tfidf</c> are empty arrays; otherwise they hold the combined postings and
/// scores, highest score first.
/// </returns>
public SearchResult Search(string str)
{
    Dictionary<string, int> ID;
    int N;
    string[] words;
    int[] results = null;
    double[] tfidf = null;

    // 'using' releases the connection, commands and readers even when a query or a
    // cast throws; previously they were only closed on the success path.
    using (SqlConnection conn = new SqlConnection(connstr))
    {
        conn.Open();

        if (lastVisit == Functions.end)
        {
            // Cached word->id map and document count are still current.
            ID = cacheID;
            N = cacheN;
        }
        else
        {
            using (SqlCommand countCmd = new SqlCommand("SELECT COUNT(*) FROM [data]", conn))
            {
                N = (int)countCmd.ExecuteScalar();
            }

            ID = new Dictionary<string, int>();
            using (SqlCommand indexCmd = new SqlCommand("SELECT [id],[word] FROM [Index]", conn))
            using (SqlDataReader indexReader = indexCmd.ExecuteReader())
            {
                while (indexReader.Read())
                {
                    ID.Add(((string)indexReader["word"]).Trim(), (int)indexReader["id"]);
                }
            }

            cacheID = ID;
            cacheN = N;
            lastVisit = Functions.end;
        }

        //str = PreProcessUtility.ToSimplifyString(str);
        // Turn line breaks into spaces and pad punctuation with spaces before segmenting.
        str = str.Replace("\r", " ").Replace("\n", " ").Replace("/", " / ").Replace(":", " : ").Replace("<", " < ").Replace(">", " > ").Replace("\"", " \" ");
        str = str.Replace("&", " & ").Replace("(", " ( ").Replace(")", " ) ").Replace("-", " - ").Replace("'", " ' ");

        words = WordResultToString(wordSegment.Segment(str, 1));

        for (int i = 0; i < words.Length; i++)
        {
            // Single dictionary lookup (and single ToLower) per word.
            int wordId;
            if (!ID.TryGetValue(words[i].ToLower(), out wordId))
            {
                // A word that is not indexed empties the whole result.
                results = null;
                break;
            }

            using (SqlCommand cmd = new SqlCommand("SELECT [df], [tf], [posts] FROM [Index] WHERE [ID]=@id", conn))
            {
                cmd.Parameters.Add("id", SqlDbType.Int).Value = wordId;

                using (SqlDataReader dr = cmd.ExecuteReader())
                {
                    if (dr.Read())
                    {
                        int[] tf = BytesToInts((byte[])dr["tf"]);
                        int[] posts = BytesToInts((byte[])dr["posts"]);
                        int df = (int)dr["df"];
                        double[] tfidf2 = getTfIdf(tf, df, N);

                        if (results == null)
                        {
                            results = posts;
                            tfidf = tfidf2;
                        }
                        else
                        {
                            results = CombineLists(results, posts, tfidf, tfidf2, out tfidf);
                        }
                    }
                }
            }
        }
    }

    SearchResult ret = new SearchResult();
    if (results == null)
    {
        ret.documentID = new int[0];
        ret.tfidf = new double[0];
    }
    else
    {
        // Sort ascending by score (document ids follow), then reverse both arrays
        // to obtain descending order.
        Array.Sort(tfidf, results);
        Array.Reverse(tfidf);
        Array.Reverse(results);
        ret.documentID = results;
        ret.tfidf = tfidf;
    }

    ret.words = words;
    return ret;
}
//=======================================================
// Run word segmentation
//=======================================================
/// <summary>
/// Segments <paramref name="sentence"/> by delegating to the wrapped segmenter
/// with this instance's <c>nKind</c>.
/// </summary>
/// <param name="sentence">Text to segment.</param>
/// <returns>The result list returned by <c>wordSegment.Segment</c>, unmodified.</returns>
public List<WordResult[]> Segment(string sentence)
{
    List<WordResult[]> segments = wordSegment.Segment(sentence, nKind);
    return segments;
}