/// <summary>
/// Segments a sentence with a freshly created <c>PosSegmenter</c>.
/// </summary>
/// <param name="sentance">Text to segment.</param>
/// <returns>The pairs returned by <c>PosSegmenter.Cut</c>, materialized as a list.</returns>
public static List<Pair> CutWord(String sentance)
{
    var tagger = new PosSegmenter();
    return tagger.Cut(sentance).ToList();
}
/// <summary>
/// Feeds <c>PosSegmenter.Cut</c> to the shared cut-test driver with the flag set to
/// <c>true</c> and the <c>pos_cut_hmm.txt</c> case file.
/// </summary>
public void TestCut()
{
    var posSeg = new PosSegmenter(new JiebaSegmenter());

    TestCutFunction(posSeg.Cut, true, @"Cases\pos_cut_hmm.txt");
}
/// <summary>
/// Reads the entity name sets and builds <c>pos_tagger</c>, but only when
/// <c>pos_tagger</c> has not been assigned yet.
/// </summary>
public EntitySegmenter()
{
    // Already initialised: nothing to load.
    if (pos_tagger != null)
    {
        return;
    }

    movie_name = ReadEntityFromFile(data_path + movie_filename);
    movie_name.UnionWith(ReadEntityFromFile(data_path + movie_nosplite_filename)); // additional
    artist_name = ReadEntityFromFile(data_path + artist_filename);
    director_name = ReadEntityFromFile(data_path + director_filename);
    country_name = ReadEntityFromFile(data_path + country_filename);
    genre_name = ReadEntityFromFile(data_path + genre_filename);

    // NOTE:
    // it seems the later PosSegmenter will overlap the former one, i.e. director
    // will overlay artist when the artist have the same name with director
    // even we use "new PosSegment(segment_xxx)".
    // this issue is caused by the static _wordTagTab in PosSegmenter.cs in Jieba.NET

    // User dictionaries, loaded in this exact order.
    // (movieindeepdomain_filename is intentionally not loaded.)
    var userDictionaries = new[]
    {
        movie_filename,
        movie_nosplite_filename,        // additional
        artist_filename,
        director_filename,
        celebrityindeepdomain_filename, // additional
        country_filename,
        genre_filename
    };

    var segmenter = new JiebaSegmenter();
    foreach (var dictionaryFile in userDictionaries)
    {
        segmenter.LoadUserDict(data_path + dictionaryFile);
    }

    pos_tagger = new PosSegmenter(segmenter);
}
/// <summary>
/// Writes three word-statistics sections to the program log: one for both contract
/// parties (JiaFang and YiFang together), one for contract names and one for project names.
/// </summary>
/// <remarks>
/// Each section resets <c>PropertyWordAnlayze</c>, feeds it one field from every entry
/// of <c>Traning.ContractList</c>, and then writes the result to the log.
/// The previously declared but never used locals (a PosSegmenter and two dictionaries)
/// have been removed.
/// </remarks>
public static void CompanyAnlayze()
{
    // Party A / Party B statistics.
    FDDC.Program.Logger.WriteLine("甲方乙方统计:");
    PropertyWordAnlayze.Init();
    foreach (var contract in Traning.ContractList)
    {
        PropertyWordAnlayze.PutWord(contract.JiaFang);
        PropertyWordAnlayze.PutWord(contract.YiFang);
    }
    PropertyWordAnlayze.WriteToLog();

    // Contract name statistics.
    FDDC.Program.Logger.WriteLine("合同统计:");
    PropertyWordAnlayze.Init();
    foreach (var contract in Traning.ContractList)
    {
        PropertyWordAnlayze.PutWord(contract.ContractName);
    }
    PropertyWordAnlayze.WriteToLog();

    // Project name statistics.
    FDDC.Program.Logger.WriteLine("工程统计:");
    PropertyWordAnlayze.Init();
    foreach (var contract in Traning.ContractList)
    {
        PropertyWordAnlayze.PutWord(contract.ProjectName);
    }
    PropertyWordAnlayze.WriteToLog();
}
/// <summary>
/// Worker-thread routine that segments a single paragraph.
/// </summary>
/// <param name="nowNum">Index of the paragraph inside <c>dc.preResult</c>.</param>
private void workCutParagraph(int nowNum)
{
    var tagger = new PosSegmenter();

    // Progress message: paragraph index, paragraph count and percentage (two decimals).
    double percent = Math.Round((double)nowNum * 100.0 / dc.preResult.Count, 2);
    print(string.Format("正在对第{0}段分词(共{1}段,{2}%)", nowNum, dc.preResult.Count, percent));

    try
    {
        int sentenceIndex = 0;
        while (sentenceIndex < dc.preResult[nowNum].Count)
        {
            try
            {
                // Segment one sentence and record it with its paragraph/sentence index.
                dc.sentences.Add(
                    new Sentence(nowNum, sentenceIndex, tagger.Cut(dc.preResult[nowNum][sentenceIndex])));
            }
            catch
            {
                // Best effort: a sentence that fails is recorded with an empty token list.
                dc.sentences.Add(new Sentence(nowNum, sentenceIndex, new List<Pair>()));
            }

            sentenceIndex++;
        }
    }
    catch (Exception ex)
    {
        print("分词失败:" + ex.Message);
    }
}
/// <summary>
/// Feeds <c>PosSegmenter.Cut</c> to the shared cut-test driver with the flag set to
/// <c>false</c> and the <c>pos_cut_no_hmm.txt</c> case file resolved through <c>TestHelper</c>.
/// </summary>
public void TestCutWithouHmm()
{
    var posSeg = new PosSegmenter(new JiebaSegmenter());

    TestCutFunction(posSeg.Cut, false, TestHelper.GetCaseFilePath("pos_cut_no_hmm.txt"));
}
/// <summary>
/// Builds a new <c>PosSegmenter</c> from the match groups, writes the same entries to the
/// user dictionary file and then swaps the new segmenter into <c>segmenter</c>.
/// </summary>
/// <remarks>
/// <c>updating</c> is set to true for the duration of the call and reset in the finally block.
/// </remarks>
void InitTokenSegmenter()
{
    Trace.WriteLine("InitTokenSegmenter Thread Start.");
    updating = true;
    try
    {
        using (var context = new ApplicationDbContext())
        {
            // Match groups come from the memory cache; the factory queries the database.
            var matchGroups = app.MemoryCache.Get("match_groups", () => context.MatchGroups.ToList());

            updatedSegmenter = new PosSegmenter();
            foreach (var matchGroup in matchGroups)
            {
                // Tag format: "<numeric type>|<id>".
                var tag = ((int)matchGroup.Type).ToString() + "|" + matchGroup.Id;
                updatedSegmenter.AddWord(matchGroup.Name, 99999, tag);
            }

            // One "<name>|<numeric type>|<id>" line per match group.
            var dictionaryLines = matchGroups
                .Select(g => g.Name + "|" + (int)g.Type + "|" + g.Id)
                .ToArray();
            File.WriteAllLines(user_dict_txt, dictionaryLines);
            user_dict_load = true;
        }

        // Publish the finished segmenter and clear the staging field.
        segmenter = updatedSegmenter;
        updatedSegmenter = null;
    }
    finally
    {
        Trace.WriteLine("InitTokenSegmenter Thread End.");
        updating = false;
    }
}
/// <summary>
/// Feeds <c>PosSegmenter.CutInParallel</c> to the shared parallel cut-test driver with the
/// flag set to <c>true</c> and the <c>pos_cut_hmm.txt</c> case file.
/// </summary>
public void TestCutInParallel()
{
    var posSeg = new PosSegmenter(new JiebaSegmenter());

    TestParallelCutFunction(posSeg.CutInParallel, true, TestHelper.GetCaseFilePath("pos_cut_hmm.txt"));
}
/// <summary>
/// Splits a text into parts at every token whose part-of-speech flag equals
/// <c>LTPTrainingNER.连词</c> (conjunction).
/// </summary>
/// <param name="OrgString">Text to split. Null or empty yields an empty list.</param>
/// <returns>
/// The non-empty runs of text between conjunction tokens, in order.
/// The conjunction tokens themselves are dropped.
/// </returns>
public static List<String> CutByPOSConection(string OrgString)
{
    var rtn = new List<String>();

    // Nothing to segment; avoids handing null to the segmenter.
    if (String.IsNullOrEmpty(OrgString))
    {
        return rtn;
    }

    var pos = new PosSegmenter();
    var current = new System.Text.StringBuilder();

    foreach (var item in pos.Cut(OrgString))
    {
        if (item.Flag == LTPTrainingNER.连词)
        {
            // A conjunction closes the part collected so far, if any.
            if (current.Length > 0)
            {
                rtn.Add(current.ToString());
                current.Clear();
            }
        }
        else
        {
            current.Append(item.Word);
        }
    }

    // Flush the trailing part.
    if (current.Length > 0)
    {
        rtn.Add(current.ToString());
    }

    return rtn;
}
/// <summary>
/// Feeds <c>PosSegmenter.Cut</c> to the shared cut-test driver with the flag set to
/// <c>false</c> and the <c>Cases\pos_cut_no_hmm.txt</c> case file.
/// </summary>
public void TestCutWithouHmm()
{
    var posSeg = new PosSegmenter(new JiebaSegmenter());

    TestCutFunction(posSeg.Cut, false, @"Cases\pos_cut_no_hmm.txt");
}
/// <summary>
/// Quick test area.
/// </summary>
/// <remarks>
/// Only the first statement runs. The unconditional <c>return</c> that follows it makes
/// the remainder of the method unreachable scratch code.
/// </remarks>
private static void QuickTestArea()
{
    CompanyNameLogic.GetCompanyNameByNerInfo(
        LTPTrainingNER.GetParagraghList(StockChangePath_TEST + "/ner/18877033.xml"));
    return;

    // ---- Parked scratch code below; never executed because of the return above. ----
    var sample = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
    var words = new PosSegmenter().Cut(sample);

    Evaluator = new StreamWriter("Evaluator.log");
    var scoreFile = "Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt";
    Score = new StreamWriter(scoreFile);
    //Evaluate.EvaluateReorganizationByFile(@"E:\WorkSpace2018\FDDC2018\FDDC_SRC\Result\chongzu_train.txt");
    //Score.Close();
    //Evaluator.Close();
    //TraningDataset.InitReorganization();

    ReOrganizationTraning.EvaluateMethodList = new[]
    {
        "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法",
        "现金流折现法", "现金流折现法", "剩余法", "内含价值调整法", "可比公司市净率法", "重置成本法",
        "收益现值法", "基础资产法", "假设清偿法", "成本逼近法", "单项资产加和法", "成本加和法",
        "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
    }.ToList();

    var reorganization = new Reorganization();
    reorganization.Id = "748379";
    reorganization.HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html";
    //t.TextFileName = ContractPath_TEST + "/txt/128869.txt";
    //t.NerXMLFileName = ContractPath_TEST + "/ner/128869.xml";
    reorganization.Init();

    var records = reorganization.Extract();
    var firstRecordText = records[0].ConvertToString();
}
/// <summary>
/// Scratch routine: segments a sample company name with two user words added, normalises
/// the sample text, and finally prints the word/flag pairs of a short test string.
/// </summary>
public static void RunWordAnlayze()
{
    var sample = "华陆工程(科技)有限责任公司";

    var jieba = new JiebaSegmenter();
    jieba.AddWord("华陆工程科技有限责任公司");
    jieba.AddWord("中煤陕西榆林能源化工有限公司");

    // The results below are not used further; they are only computed.
    var samplePairs = new PosSegmenter(jieba).Cut(sample);
    sample = RegularTool.TrimBrackets(sample.NormalizeTextResult());

    /*
     *  var SProjectName = new Surround();
     *  var root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1044779.html");
     *  var Contract = TraningDataset.GetContractById("1044779")[0];
     *  SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     *
     *  root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1450.html");
     *  Contract = TraningDataset.GetContractById("1450")[0];
     *  SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     *
     *  root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1042224.html");
     *  Contract = TraningDataset.GetContractById("1042224")[0];
     *  SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     *
     *  root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\917362.html");
     *  Contract = TraningDataset.GetContractById("917362")[0];
     *  SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
     *
     *  SProjectName.WriteTop(10);
     */

    var TestString = "承运市";
    var tagger = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
    foreach (var pair in tagger.Cut(TestString))
    {
        Console.WriteLine(pair.Word + ":" + pair.Flag);
    }
}
/// <summary>
/// Segments every line of the file named by <paramref name="options"/> and prints the
/// results to the console, one output line per input line.
/// </summary>
/// <param name="options">
/// Supplies the input file name, whether to emit part-of-speech tags (<c>POS</c>), the
/// <c>CutAll</c> and <c>NoHmm</c> flags, and the token delimiter (defaults to "/ " when blank).
/// </param>
private static void SegmentFile(Options options)
{
    var fileName = Path.GetFullPath(options.FileName);
    var lines = File.ReadAllLines(fileName);

    var segmenter = new JiebaSegmenter();
    Func<string, bool, bool, IEnumerable<string>> cutMethod;
    if (options.POS)
    {
        // Build the tagger once instead of once per input line.
        var posSeg = new PosSegmenter(segmenter);
        cutMethod = (text, cutAll, hmm) =>
            posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
    }
    else
    {
        cutMethod = segmenter.Cut;
    }

    var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

    // The third argument of the cut delegate is "hmm" (enable HMM), while the option is
    // phrased negatively, so it has to be inverted. Passing NoHmm straight through turned
    // HMM on exactly when the user asked for it to be off.
    // NOTE(review): assumes Options.NoHmm == true means "disable HMM" — confirm against Options.
    var useHmm = !options.NoHmm;

    var result = new List<string>(lines.Length);
    foreach (var line in lines)
    {
        result.Add(string.Join(delimiter, cutMethod(line, options.CutAll, useHmm)));
    }

    Console.WriteLine(string.Join(Environment.NewLine, result));
}
/// <summary>
/// Segments <paramref name="text"/> and prints the tokens as space-separated
/// "word/flag" entries on one console line.
/// </summary>
/// <param name="text">Text to segment.</param>
private static void TestPosSegmenterCut(string text)
{
    var tagged = new PosSegmenter()
        .Cut(text)
        .Select(token => $"{token.Word}/{token.Flag}");

    Console.WriteLine(string.Join(" ", tagged));
}
/// <summary>
/// Page load handler: segments a fixed sample sentence and writes each token to the
/// response as "word/flag" followed by a line-break tag.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    const string sample = "就算你留恋开放在水中娇艳的水仙,别忘了寂寞的山谷里角落里野百合也有春天";

    var segmenter = new PosSegmenter();
    var fragments = segmenter
        .Cut(sample)
        .Select(token => string.Format("{0}/{1}</br>", token.Word, token.Flag));

    Response.Write(string.Join(" ", fragments));
}
/// <summary>
/// Segments a fixed sample sentence and prints the tokens as space-separated
/// "word/flag" entries.
/// </summary>
public void TestCutNames()
{
    var tagged = new PosSegmenter()
        .Cut("吉林的省会是长春")
        .Select(token => string.Format("{0}/{1}", token.Word, token.Flag));

    var line = string.Join(" ", tagged);
    Console.WriteLine(line);
}
/// <summary>
/// Demo: segments a fixed sample sentence and prints the tokens as space-separated
/// "word/flag" entries.
/// </summary>
public void PosCutDemo()
{
    const string sample = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";

    var segmenter = new PosSegmenter();
    var tagged = segmenter
        .Cut(sample)
        .Select(token => string.Format("{0}/{1}", token.Word, token.Flag));

    Console.WriteLine(string.Join(" ", tagged));
}
/// <summary>
/// Segments <paramref name="text"/> and returns the resulting word/flag pairs.
/// </summary>
/// <param name="text">Text to segment.</param>
/// <returns>The sequence returned by <c>PosSegmenter.Cut</c>.</returns>
public IEnumerable<Pair> jieba(string text)
{
    // Example request:
    // http://localhost:5000/ef/jieba?text=%E4%BD%A0%E5%A5%BD%E8%BF%99%E6%98%AF%E9%94%99%E8%AF%AF%E7%9A%84
    var tagger = new PosSegmenter(new JiebaSegmenter());
    return tagger.Cut(text);
}
/// <summary>
/// Static initialiser: creates the shared segmenter and part-of-speech segmenter, loads
/// the user dictionary and reads the stop-word list.
/// </summary>
static Preprocessor()
{
    var jieba = new JiebaSegmenter();
    seg = jieba;
    posSeg = new PosSegmenter(jieba);

    // Load the user dictionary.
    jieba.LoadUserDict(user_dict_path);

    // Stop words.
    stopWordList = GetStopWords();
}
/// <summary>
/// Creates <c>posSeg</c> on first use.
/// </summary>
/// <remarks>
/// Before the segmenter is created, the AppDomain value stored under "DataPath" is copied
/// to the "JiebaConfigFileDir" key.
/// </remarks>
private void Init()
{
    if (posSeg != null)
    {
        return;
    }

    var domain = AppDomain.CurrentDomain;
    var dataDirectory = domain.GetData("DataPath").ToString();
    domain.SetData("JiebaConfigFileDir", dataDirectory);

    posSeg = new PosSegmenter();
}
/// <summary>
/// Creates an extractor with a span of 5, fresh segmenters, and stop words loaded from
/// <c>ConfigManager.StopWordsFile</c>; the default stop words are used when that yields none.
/// </summary>
public TextRankExtractor()
{
    Span = 5;

    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    SetStopWords(ConfigManager.StopWordsFile);
    if (!StopWords.IsEmpty())
    {
        return;
    }

    // Nothing was loaded: fall back to the built-in list.
    StopWords.UnionWith(DefaultStopWords);
}
/// <summary>
/// Text ranking extractor. Uses a span of 5, fresh segmenters, and the stop words from
/// <c>Dict.StopWords</c>; the default stop words are added when that set is empty.
/// </summary>
public TextRankExtractor()
{
    Span = 5;

    Segmenter = new Segmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    StopWords = Dict.StopWords;
    if (!StopWords.IsEmpty())
    {
        return;
    }

    // Empty stop-word set: fall back to the built-in list.
    StopWords.UnionWith(DefaultStopWords);
}
// The "adjectives" the teaching assistant referred to (e.g. "精通", "熟悉") are actually
// verbs, so the tokens flagged "v" are what gets extracted here.
private static List<string> getAdjs(string quals)
{
    var verbs = new List<string>();

    foreach (var token in new PosSegmenter().Cut(quals))
    {
        if (token.Flag != "v")
        {
            continue;
        }

        verbs.Add(token.Word);
    }

    return verbs;
}
/// <summary>
/// Creates an extractor with fresh segmenters, stop words loaded from
/// <c>ConfigManager.StopWordsFile</c> (default stop words when that yields none), and IDF
/// data loaded from <c>DefaultIdfFile</c>.
/// </summary>
public TfidfExtractor()
{
    Segmenter = new JiebaSegmenter();
    PosSegmenter = new PosSegmenter(Segmenter);

    SetStopWords(ConfigManager.StopWordsFile);
    var noStopWordsLoaded = StopWords.IsEmpty();
    if (noStopWordsLoaded)
    {
        StopWords.UnionWith(DefaultStopWords);
    }

    Loader = new IdfLoader(DefaultIdfFile);

    // Copy the loaded frequencies and the median IDF onto the extractor.
    IdfFreq = Loader.IdfFreq;
    MedianIdf = Loader.MedianIdf;
}
/// <summary>
/// Builds a co-occurrence graph over the words of <paramref name="text"/> and returns the
/// result of ranking that graph.
/// </summary>
/// <param name="text">Text to segment and rank.</param>
/// <param name="allowPos">
/// Part-of-speech filter passed to <c>PairFilter</c>; <c>DefaultPosFilter</c> is used when
/// <c>IsEmpty()</c> reports this as empty.
/// </param>
/// <returns>The dictionary produced by <c>UndirectWeightedGraph.Rank()</c>.</returns>
/// <remarks>
/// Two words are linked when both pass <c>PairFilter</c> and the second one follows the
/// first within <c>Span</c> positions. The edge weight is the number of such occurrences.
/// Pairs are keyed by a tuple rather than a "$"-joined string, so words that contain '$'
/// can no longer be split incorrectly or collide with another pair.
/// </remarks>
private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
{
    if (allowPos.IsEmpty())
    {
        allowPos = DefaultPosFilter;
    }

    var g = new UndirectWeightedGraph();
    var cm = new Dictionary<System.Tuple<string, string>, int>();
    var words = PosSegmenter.Cut(text).ToList();

    for (var i = 0; i < words.Count; i++)
    {
        var wp = words[i];
        if (!PairFilter(allowPos, wp))
        {
            continue;
        }

        // Look ahead inside the window, stopping at the end of the text.
        for (var j = i + 1; j < i + Span && j < words.Count; j++)
        {
            if (!PairFilter(allowPos, words[j]))
            {
                continue;
            }

            var key = System.Tuple.Create(wp.Word, words[j].Word);

            // A missing key leaves count at 0, so one lookup covers both cases.
            int count;
            cm.TryGetValue(key, out count);
            cm[key] = count + 1;
        }
    }

    foreach (var p in cm)
    {
        g.AddEdge(p.Key.Item1, p.Key.Item2, p.Value);
    }

    return g.Rank();
}
/// <summary>
/// Creates an extractor around <paramref name="segmenter"/>, loading stop words and IDF
/// data from embedded resources.
/// </summary>
/// <param name="segmenter">
/// Segmenter to use; a new <c>JiebaSegmenter</c> is created when <c>IsNull()</c> is true.
/// </param>
public TfidfExtractor(JiebaSegmenter segmenter = null)
{
    if (segmenter.IsNull())
    {
        Segmenter = new JiebaSegmenter();
    }
    else
    {
        Segmenter = segmenter;
    }
    PosSegmenter = new PosSegmenter(Segmenter);

    //SetStopWords(ConfigManager.StopWordsFile);
    SetFromResources();
    var noStopWordsLoaded = StopWords.IsEmpty();
    if (noStopWordsLoaded)
    {
        StopWords.UnionWith(DefaultStopWords);
    }

    //Loader = new IdfLoader(DefaultIdfFile);
    Loader = new IdfLoader();
    Loader.LoadFromResources();

    // Copy the loaded frequencies and the median IDF onto the extractor.
    IdfFreq = Loader.IdfFreq;
    MedianIdf = Loader.MedianIdf;
}
/// <summary>
/// Removes a leading "verb + particle" structure from a text.
/// </summary>
/// <param name="OrgString">Text to trim.</param>
/// <returns>
/// The text that follows the first cut point, or <paramref name="OrgString"/> unchanged
/// when no cut point is found.
/// </returns>
/// <remarks>
/// A cut point is the first token that is either flagged "uj" and directly preceded by a
/// token flagged "v", or is the "v" token "购买" followed by a token not flagged "uj".
/// </remarks>
string TrimUJWords(string OrgString)
{
    var tokens = new PosSegmenter().Cut(OrgString).ToList();

    var cutIndex = -1;
    for (var idx = 0; idx < tokens.Count; idx++)
    {
        var current = tokens[idx];

        // Case 1: "uj" directly after a verb.
        var ujAfterVerb = current.Flag == "uj"
                          && idx >= 1
                          && tokens[idx - 1].Flag == "v";

        // Case 2: the verb "购买" with a following token that is not "uj".
        var buyWithoutUj = current.Flag == "v"
                           && current.Word.Equals("购买")
                           && idx + 1 < tokens.Count
                           && tokens[idx + 1].Flag != "uj";

        if (ujAfterVerb || buyWithoutUj)
        {
            cutIndex = idx;
            break;
        }
    }

    if (cutIndex == -1)
    {
        return OrgString;
    }

    // Re-join everything after the cut point.
    var remainder = "";
    foreach (var token in tokens.Skip(cutIndex + 1))
    {
        remainder += token.Word;
    }

    //Console.WriteLine("Before TrimUJ:" + OrgString);
    //Console.WriteLine("After TrimUJ:" + after);
    return remainder;
}
/// <summary>
/// Looks up a location for every token of <paramref name="title"/> flagged "ns" and
/// returns the one with the highest <c>level</c>.
/// </summary>
/// <param name="title">Title text to segment.</param>
/// <returns>
/// The looked-up entry with the greatest <c>level</c> (the last one found among equals),
/// or a new empty <c>LanAndLon</c> when the title has no "ns" token.
/// </returns>
/// <remarks>
/// The previous code called <c>OrderBy</c> and discarded its result. <c>OrderBy</c> does
/// not sort in place, so the list stayed in insertion order and the last token won
/// regardless of level. The sorted sequence is now actually used.
/// NOTE(review): assumes <c>GetLatAndLonByWord</c> never returns null — confirm, since the
/// level selector now runs on every entry.
/// </remarks>
public static LanAndLon GetLatAndLonByTitle(string title)
{
    var candidates = new List<LanAndLon>();

    var posSeg = new PosSegmenter();
    foreach (Pair item in posSeg.Cut(title))
    {
        if (item.Flag == "ns")
        {
            candidates.Add(GetLatAndLonByWord(item.Word));
        }
    }

    if (candidates.Count == 0)
    {
        return new LanAndLon();
    }

    // OrderBy is stable, so ties on level keep insertion order and Last() picks the latest.
    return candidates.OrderBy(ll => ll.level).Last();
}
// Analysis of the entities' own characteristics.
/// <summary>
/// Writes four statistics sections to the training log: Party A (JiaFang), Party B
/// (YiFang), contract names and project names.
/// </summary>
/// <remarks>
/// Each section resets <c>EntityWordAnlayzeTool</c>, feeds it one field from every entry
/// of <c>TraningDataset.ContractList</c>, and then writes the result to the log.
/// The previously declared but never used locals (a PosSegmenter and two dictionaries)
/// have been removed.
/// </remarks>
public static void EntityWordPerperty()
{
    // Party A statistics.
    Program.Training.WriteLine("甲方统计:");
    EntityWordAnlayzeTool.Init();
    foreach (var contract in TraningDataset.ContractList)
    {
        EntityWordAnlayzeTool.PutEntityWordPerperty(contract.JiaFang);
    }
    EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

    // Party B statistics.
    Program.Training.WriteLine("乙方统计:");
    EntityWordAnlayzeTool.Init();
    foreach (var contract in TraningDataset.ContractList)
    {
        EntityWordAnlayzeTool.PutEntityWordPerperty(contract.YiFang);
    }
    EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

    // Contract name statistics.
    Program.Training.WriteLine("合同统计:");
    EntityWordAnlayzeTool.Init();
    foreach (var contract in TraningDataset.ContractList)
    {
        EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ContractName);
    }
    EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

    // Project name statistics.
    Program.Training.WriteLine("工程统计:");
    EntityWordAnlayzeTool.Init();
    foreach (var contract in TraningDataset.ContractList)
    {
        EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ProjectName);
    }
    EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();
}
/// <summary>
/// TF-IDF extractor built around <paramref name="segmenter"/>, using the stop words from
/// <c>Dict.StopWords</c> (default stop words when that set is empty) and a new <c>IdfLoader</c>.
/// </summary>
/// <param name="segmenter">
/// Segmenter to use; a new <c>Segmenter</c> is created when <c>IsNull()</c> is true.
/// </param>
public TfidfExtractor(Segmenter segmenter = null)
{
    Segmenter = segmenter.IsNull() ? new Segmenter() : segmenter;
    PosSegmenter = new PosSegmenter(Segmenter);

    StopWords = Dict.StopWords;
    var stopWordsMissing = StopWords.IsEmpty();
    if (stopWordsMissing)
    {
        StopWords.UnionWith(DefaultStopWords);
    }

    Loader = new IdfLoader();

    // Copy the loaded frequencies and the median IDF onto the extractor.
    IdfFreq = Loader.IdfFreq;
    MedianIdf = Loader.MedianIdf;
}