Example #1
0
        /// <summary>
        /// Segments a sentence with part-of-speech tagging.
        /// </summary>
        /// <param name="sentance">Text to segment.</param>
        /// <returns>The pairs produced by <c>PosSegmenter.Cut</c>, materialized into a list.</returns>
        public static List <Pair> CutWord(String sentance)
        {
            var tagger = new PosSegmenter();

            return tagger.Cut(sentance).ToList();
        }
Example #2
0
        // Runs TestCutFunction over the pos_cut_hmm case file with the flag argument set to true.
        public void TestCut()
        {
            var tagger = new PosSegmenter(new JiebaSegmenter());

            TestCutFunction(tagger.Cut, true, @"Cases\pos_cut_hmm.txt");
        }
Example #3
0
        /// <summary>
        /// Reads the entity name sets and builds <c>pos_tagger</c>; does nothing when
        /// <c>pos_tagger</c> has already been assigned.
        /// </summary>
        public EntitySegmenter()
        {
            if (pos_tagger != null)
            {
                return;
            }

            // Entity name sets, one per category. The movie set is the union of two files.
            movie_name = ReadEntityFromFile(data_path + movie_filename);
            movie_name.UnionWith(ReadEntityFromFile(data_path + movie_nosplite_filename));  // additional
            artist_name   = ReadEntityFromFile(data_path + artist_filename);
            director_name = ReadEntityFromFile(data_path + director_filename);
            country_name  = ReadEntityFromFile(data_path + country_filename);
            genre_name    = ReadEntityFromFile(data_path + genre_filename);

            // NOTE:
            // A single segmenter is fed every dictionary because a later PosSegmenter seems to
            // override an earlier one (e.g. director overlays artist when both share a name),
            // even when each is built with its own "new PosSegment(segment_xxx)".
            // The cause is the static _wordTagTab in PosSegmenter.cs in Jieba.NET.
            var userDictFiles = new[]
            {
                movie_filename,
                movie_nosplite_filename,        // additional
                //movieindeepdomain_filename,   // additional (disabled)
                artist_filename,
                director_filename,
                celebrityindeepdomain_filename, // additional
                country_filename,
                genre_filename
            };

            var combined = new JiebaSegmenter();
            foreach (var dictFile in userDictFiles)
            {
                combined.LoadUserDict(data_path + dictFile);
            }
            pos_tagger = new PosSegmenter(combined);
        }
Example #4
0
    /// <summary>
    /// Feeds contract fields into <c>PropertyWordAnlayze</c> in three groups (both parties,
    /// contract name, project name) and calls <c>WriteToLog</c> after each group.
    /// </summary>
    public static void CompanyAnlayze()
    {
        // NOTE(review): this instance is never used in the method. The construction is kept in case
        // it has initialization side effects; confirm and remove if it has none.
        var posSeg = new PosSegmenter();

        // Party A / Party B statistics
        FDDC.Program.Logger.WriteLine("甲方乙方统计:");
        PropertyWordAnlayze.Init();
        foreach (var contract in Traning.ContractList)
        {
            PropertyWordAnlayze.PutWord(contract.JiaFang);
            PropertyWordAnlayze.PutWord(contract.YiFang);
        }
        PropertyWordAnlayze.WriteToLog();

        // Contract name statistics
        FDDC.Program.Logger.WriteLine("合同统计:");
        PropertyWordAnlayze.Init();
        foreach (var contract in Traning.ContractList)
        {
            PropertyWordAnlayze.PutWord(contract.ContractName);
        }
        PropertyWordAnlayze.WriteToLog();

        // Project name statistics
        FDDC.Program.Logger.WriteLine("工程统计:");
        PropertyWordAnlayze.Init();
        foreach (var contract in Traning.ContractList)
        {
            PropertyWordAnlayze.PutWord(contract.ProjectName);
        }
        PropertyWordAnlayze.WriteToLog();
    }
Example #5
0
        /// <summary>
        /// Worker-thread routine that segments a single paragraph.
        /// </summary>
        /// <param name="nowNum">Index into <c>dc.preResult</c> of the paragraph to segment.</param>
        private void workCutParagraph(int nowNum)
        {
            PosSegmenter tagger = new PosSegmenter();

            // Progress percentage, rounded to two decimals.
            double percent = Math.Round((double)nowNum * 100.0 / dc.preResult.Count, 2);
            print(string.Format("正在对第{0}段分词(共{1}段,{2}%)", nowNum, dc.preResult.Count, percent));

            try
            {
                for (int sentenceIdx = 0; sentenceIdx < dc.preResult[nowNum].Count; sentenceIdx++)
                {
                    try
                    {
                        // Segment one sentence and record it with its paragraph/sentence indices.
                        var tagged = tagger.Cut(dc.preResult[nowNum][sentenceIdx]);
                        dc.sentences.Add(new Sentence(nowNum, sentenceIdx, tagged));
                    }
                    catch
                    {
                        // On any failure, record the sentence with an empty token list instead.
                        dc.sentences.Add(new Sentence(nowNum, sentenceIdx, new List<Pair>()));
                    }
                }
            }
            catch (Exception ex)
            {
                print("分词失败:" + ex.Message);
            }
        }
Example #6
0
        // Runs TestCutFunction over the pos_cut_no_hmm case file with the flag argument set to false.
        public void TestCutWithouHmm()
        {
            var tagger   = new PosSegmenter(new JiebaSegmenter());
            var caseFile = TestHelper.GetCaseFilePath("pos_cut_no_hmm.txt");

            TestCutFunction(tagger.Cut, false, caseFile);
        }
Example #7
0
 // Builds a fresh PosSegmenter from the match-group list, writes that list to the
 // user dictionary file, and then assigns the new segmenter to `segmenter`.
 // Start and end are traced; `updating` is true while the method runs.
 void InitTokenSegmenter()
 {
     Trace.WriteLine("InitTokenSegmenter Thread Start.");
     updating = true;
     try
     {
         using (ApplicationDbContext db = new ApplicationDbContext())
         {
             // Match groups are requested from the memory cache under "match_groups"; the lambda
             // supplies db.MatchGroups.ToList() (presumably only on a cache miss — confirm against MemoryCache.Get).
             var list = app.MemoryCache.Get("match_groups", () => db.MatchGroups.ToList());
             updatedSegmenter = new PosSegmenter();
             foreach (var item in list)
             {
                 // Register each group name with the tag "<Type as int>|<Id>".
                 // NOTE(review): 99999 is presumably a word frequency — confirm against AddWord.
                 updatedSegmenter.AddWord(item.Name, 99999, ((int)item.Type).ToString() + "|" + item.Id);
             }
             // Write one "Name|Type|Id" line per group to the user dictionary file.
             File.WriteAllLines(user_dict_txt, list.Select(x => x.Name + "|" + (int)x.Type + "|" + x.Id).ToArray());
             user_dict_load = true;
         }
         // Publish the fully built segmenter only after the using block has completed.
         segmenter        = updatedSegmenter;
         updatedSegmenter = null;
     }
     finally
     {
         // Runs on success and on exception, so `updating` is always cleared.
         Trace.WriteLine("InitTokenSegmenter Thread End.");
         updating = false;
     }
 }
Example #8
0
        // Runs TestParallelCutFunction over the pos_cut_hmm case file with the flag argument set to true.
        public void TestCutInParallel()
        {
            var tagger   = new PosSegmenter(new JiebaSegmenter());
            var caseFile = TestHelper.GetCaseFilePath("pos_cut_hmm.txt");

            TestParallelCutFunction(tagger.CutInParallel, true, caseFile);
        }
Example #9
0
    /// <summary>
    /// Splits an item into parts at conjunction tokens.
    /// </summary>
    /// <param name="OrgString">Text to split.</param>
    /// <returns>
    /// The non-empty runs of text between tokens flagged as conjunctions, in order of appearance.
    /// The conjunction tokens themselves are dropped.
    /// </returns>
    public static List <String> CutByPOSConection(string OrgString)
    {
        var tagger  = new PosSegmenter();
        var tokens  = tagger.Cut(OrgString);
        var parts   = new List<String>();
        var pending = "";

        foreach (var token in tokens)
        {
            if (token.Flag != LTPTrainingNER.连词)
            {
                // Ordinary token: extend the current part.
                pending += token.Word;
                continue;
            }

            // Conjunction: close the current part, if there is one.
            if (!String.IsNullOrEmpty(pending))
            {
                parts.Add(pending);
                pending = "";
            }
        }

        // Flush whatever follows the last conjunction.
        if (!String.IsNullOrEmpty(pending))
        {
            parts.Add(pending);
        }
        return parts;
    }
Example #10
0
        // Runs TestCutFunction over the pos_cut_no_hmm case file with the flag argument set to false.
        public void TestCutWithouHmm()
        {
            var tagger = new PosSegmenter(new JiebaSegmenter());

            TestCutFunction(tagger.Cut, false, @"Cases\pos_cut_no_hmm.txt");
        }
Example #11
0
        /// <summary>
        /// Quick test area: loads the paragraph list from one NER XML file and passes it to
        /// <c>CompanyNameLogic.GetCompanyNameByNerInfo</c>, then returns.
        /// </summary>
        private static void QuickTestArea()
        {
            var plst = LTPTrainingNER.GetParagraghList(StockChangePath_TEST + "/ner/18877033.xml");

            CompanyNameLogic.GetCompanyNameByNerInfo(plst);
            // Early exit: every statement below this return is currently unreachable.
            return;

            // Unreachable scratch: POS-segment a sample sentence.
            var s0    = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
            var pos   = new PosSegmenter();
            var words = pos.Cut(s0);

            // Unreachable scratch: open the evaluator log and a timestamped score file.
            // NOTE(review): neither writer is closed in this method (the Close calls below are commented out).
            Evaluator = new StreamWriter("Evaluator.log");
            Score     = new StreamWriter("Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
            //Evaluate.EvaluateReorganizationByFile(@"E:\WorkSpace2018\FDDC2018\FDDC_SRC\Result\chongzu_train.txt");
            //Score.Close();
            //Evaluator.Close();

            //TraningDataset.InitReorganization();
            // Unreachable scratch: list of evaluation-method names assigned to ReOrganizationTraning.
            ReOrganizationTraning.EvaluateMethodList = new string[] {
                "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法", "现金流折现法", "现金流折现法", "剩余法",
                "内含价值调整法", "可比公司市净率法", "重置成本法", "收益现值法", "基础资产法", "假设清偿法",
                "成本逼近法", "单项资产加和法", "成本加和法", "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
            }.ToList();
            // Unreachable scratch: run extraction on a single reorganization HTML file.
            var t = new Reorganization();

            t.Id           = "748379";
            t.HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html";
            //t.TextFileName = ContractPath_TEST + "/txt/128869.txt";
            //t.NerXMLFileName = ContractPath_TEST + "/ner/128869.xml";
            t.Init();
            var recs = t.Extract();
            var s1   = recs[0].ConvertToString();
        }
Example #12
0
    /// <summary>
    /// Ad-hoc experiment: tags a company name using a segmenter with two added words, normalizes
    /// that name, then prints the word/flag pairs of a short test string to the console.
    /// </summary>
    public static void RunWordAnlayze()
    {
        var companyName     = "华陆工程(科技)有限责任公司";
        var customSegmenter = new JiebaSegmenter();
        customSegmenter.AddWord("华陆工程科技有限责任公司");
        customSegmenter.AddWord("中煤陕西榆林能源化工有限公司");
        var customTagger = new PosSegmenter(customSegmenter);
        // The tagging result and the normalized name are not consumed further in this method.
        var taggedCompany = customTagger.Cut(companyName);
        companyName = RegularTool.TrimBrackets(companyName.NormalizeTextResult());
       /*  var SProjectName = new Surround();
        var root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1044779.html");
        var Contract = TraningDataset.GetContractById("1044779")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1450.html");
        Contract = TraningDataset.GetContractById("1450")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\1042224.html");
        Contract = TraningDataset.GetContractById("1042224")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);

        root = HTMLEngine.Anlayze(Program.DocBase + @"\FDDC_announcements_round1_train_20180518\round1_train_20180518\重大合同\html\917362.html");
        Contract = TraningDataset.GetContractById("917362")[0];
        SProjectName.AnlayzeEntitySurroundWords(root, Contract.ProjectName);
        SProjectName.WriteTop(10); */

        // Print each token of the test string as "word:flag" using a default tagger.
        var sample        = "承运市";
        var defaultTagger = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
        foreach (var token in defaultTagger.Cut(sample))
        {
            Console.WriteLine(token.Word + ":" + token.Flag);
        }
    }
Example #13
0
        /// <summary>
        /// Segments every line of the input file and prints the results to the console,
        /// one output line per input line.
        /// </summary>
        /// <param name="options">
        /// Options read by this method: <c>FileName</c> (input path), <c>POS</c> (emit "word/flag"
        /// tokens instead of plain words), <c>CutAll</c>, <c>NoHmm</c> and <c>Delimiter</c>
        /// (token separator; "/ " when blank).
        /// </param>
        private static void SegmentFile(Options options)
        {
            var fileName = Path.GetFullPath(options.FileName);
            var lines    = File.ReadAllLines(fileName);

            var segmenter = new JiebaSegmenter();
            Func <string, bool, bool, IEnumerable <string> > cutMethod;

            if (options.POS)
            {
                // Build the tagger once; previously a new one was created for every line.
                var posSeg = new PosSegmenter(segmenter);
                cutMethod = (text, cutAll, hmm) =>
                            posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
            }
            else
            {
                cutMethod = segmenter.Cut;
            }

            var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

            // The third argument of cutMethod is "hmm" (use HMM), while the option is an opt-out,
            // so it must be negated. The original passed NoHmm straight through, inverting the flag.
            // NOTE(review): assumes Options.NoHmm means "disable HMM", as its name says — confirm.
            var useHmm = !options.NoHmm;

            var result = new List<string>(lines.Length);
            foreach (var line in lines)
            {
                result.Add(string.Join(delimiter, cutMethod(line, options.CutAll, useHmm)));
            }
            Console.WriteLine(string.Join(Environment.NewLine, result));
        }
Example #14
0
        /// <summary>
        /// Tags <paramref name="text"/> and prints its tokens as space-separated "word/flag" items.
        /// </summary>
        private static void TestPosSegmenterCut(string text)
        {
            var tagger    = new PosSegmenter();
            var formatted = tagger.Cut(text).Select(token => $"{token.Word}/{token.Flag}");

            Console.WriteLine(string.Join(" ", formatted));
        }
        // Page load handler: tags a fixed sample sentence and writes each token to the
        // response as "word/flag</br>", with the items joined by a single space.
        protected void Page_Load(object sender, EventArgs e)
        {
            var sample    = "就算你留恋开放在水中娇艳的水仙,别忘了寂寞的山谷里角落里野百合也有春天";
            var tagger    = new PosSegmenter();
            var formatted = tagger.Cut(sample).Select(token => string.Format("{0}/{1}</br>", token.Word, token.Flag));

            Response.Write(string.Join(" ", formatted));
        }
Example #16
0
        // Tags a fixed sentence containing place names and prints the "word/flag" tokens.
        public void TestCutNames()
        {
            var tagger    = new PosSegmenter();
            var formatted = tagger.Cut("吉林的省会是长春")
                                  .Select(token => string.Format("{0}/{1}", token.Word, token.Flag));

            Console.WriteLine(string.Join(" ", formatted));
        }
Example #17
0
        // Demo: tags a fixed sample sentence and prints the tokens as space-separated "word/flag" items.
        public void PosCutDemo()
        {
            var tagger = new PosSegmenter();
            var sample = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";

            var formatted = tagger.Cut(sample).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
            var output    = string.Join(" ", formatted);

            Console.WriteLine(output);
        }
Example #18
0
        // Tags the given text with a freshly created segmenter and returns the resulting pairs.
        // Sample request: http://localhost:5000/ef/jieba?text=%E4%BD%A0%E5%A5%BD%E8%BF%99%E6%98%AF%E9%94%99%E8%AF%AF%E7%9A%84
        public IEnumerable <Pair> jieba(string text)
        {
            var tagger = new PosSegmenter(new JiebaSegmenter());

            return tagger.Cut(text);
        }
Example #19
0
 // Static constructor: creates the shared segmenter and the POS segmenter built on it,
 // then loads the user dictionary into the segmenter and the stop-word list.
 static Preprocessor()
 {
     seg    = new JiebaSegmenter();
     posSeg = new PosSegmenter(seg);
     // Load the user dictionary (into seg, after posSeg has already been constructed from it)
     seg.LoadUserDict(user_dict_path);
     // Stop words
     stopWordList = GetStopWords();
 }
Example #20
0
        /// <summary>
        /// Creates <c>posSeg</c> on first use. Before doing so it copies the AppDomain's
        /// "DataPath" value into the "JiebaConfigFileDir" AppDomain data slot.
        /// </summary>
        private void Init()
        {
            if (posSeg != null)
            {
                return;
            }

            var domain  = AppDomain.CurrentDomain;
            var dataDir = domain.GetData("DataPath").ToString();

            domain.SetData("JiebaConfigFileDir", dataDir);
            posSeg = new PosSegmenter();
        }
Example #21
0
        /// <summary>
        /// Creates the extractor with <c>Span</c> set to 5, a new segmenter, a POS segmenter built on it,
        /// and stop words taken from the configured stop-words file.
        /// </summary>
        public TextRankExtractor()
        {
            Span = 5;

            Segmenter    = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);
            SetStopWords(ConfigManager.StopWordsFile);
            // Fall back to the built-in defaults when the configured file yielded no stop words.
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }
        }
Example #22
0
        /// <summary>
        /// Text ranking. Creates the extractor with <c>Span</c> set to 5, a new segmenter,
        /// a POS segmenter built on it, and the stop words from <c>Dict.StopWords</c>.
        /// </summary>
        public TextRankExtractor()
        {
            Span = 5;

            Segmenter    = new Segmenter();
            PosSegmenter = new PosSegmenter(Segmenter);

            StopWords = Dict.StopWords;

            // Fall back to the built-in defaults when no stop words are available.
            // NOTE(review): StopWords references Dict.StopWords directly, so this UnionWith
            // appears to modify that shared set as well — confirm this is intended.
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }
        }
Example #23
0
        // The "adj" words the teaching assistant refers to (e.g. "精通", "熟悉") are actually
        // tagged as verbs, so this collects the tokens flagged "v".
        private static List <string> getAdjs(string quals)
        {
            var tagger = new PosSegmenter();
            var verbs  = new List<string>();

            foreach (var token in tagger.Cut(quals))
            {
                if (token.Flag != "v")
                {
                    continue;
                }
                verbs.Add(token.Word);
            }
            return verbs;
        }
Example #24
0
        /// <summary>
        /// Creates the extractor with a new segmenter, a POS segmenter built on it, stop words from the
        /// configured stop-words file, and IDF data loaded from <c>DefaultIdfFile</c>.
        /// </summary>
        public TfidfExtractor()
        {
            Segmenter    = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);
            SetStopWords(ConfigManager.StopWordsFile);
            // Fall back to the built-in defaults when the configured file yielded no stop words.
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }

            Loader = new IdfLoader(DefaultIdfFile);

            // Copy the loader's IDF table and median IDF onto this instance.
            IdfFreq   = Loader.IdfFreq;
            MedianIdf = Loader.MedianIdf;
        }
Example #25
0
        /// <summary>
        /// Builds a weighted co-occurrence graph over the words of <paramref name="text"/> that pass
        /// <c>PairFilter</c>, and returns the graph's ranking.
        /// </summary>
        /// <param name="text">Text to segment and rank.</param>
        /// <param name="allowPos">POS filter passed to <c>PairFilter</c>; <c>DefaultPosFilter</c> is used when empty.</param>
        /// <returns>The result of <c>UndirectWeightedGraph.Rank()</c> for the co-occurrence graph.</returns>
        private IDictionary <string, double> ExtractTagRank(string text, IEnumerable <string> allowPos)
        {
            if (allowPos.IsEmpty())
            {
                allowPos = DefaultPosFilter;
            }

            var g = new UndirectWeightedGraph();
            // Co-occurrence counts keyed by the (first, second) word pair. A structured key replaces
            // the former "first$second" string, which was split apart wrongly when a word contained '$'.
            var cm    = new Dictionary <System.Tuple <string, string>, int>();
            var words = PosSegmenter.Cut(text).ToList();

            for (var i = 0; i < words.Count; i++)
            {
                var wp = words[i];
                if (!PairFilter(allowPos, wp))
                {
                    continue;
                }

                // Pair the word with each accepted word in the window of Span - 1 following tokens.
                for (var j = i + 1; j < i + Span && j < words.Count; j++)
                {
                    if (!PairFilter(allowPos, words[j]))
                    {
                        continue;
                    }

                    var key = System.Tuple.Create(wp.Word, words[j].Word);
                    int seen;
                    cm.TryGetValue(key, out seen);   // seen stays 0 for a new pair
                    cm[key] = seen + 1;
                }
            }

            foreach (var p in cm)
            {
                g.AddEdge(p.Key.Item1, p.Key.Item2, p.Value);
            }

            return g.Rank();
        }
Example #26
0
        /// <summary>
        /// Creates the extractor around the given segmenter (or a new one when none is supplied),
        /// with stop words and IDF data loaded from resources.
        /// </summary>
        /// <param name="segmenter">Segmenter to use; a new <c>JiebaSegmenter</c> is created when <c>IsNull()</c> is true.</param>
        public TfidfExtractor(JiebaSegmenter segmenter = null)
        {
            if (segmenter.IsNull())
            {
                Segmenter = new JiebaSegmenter();
            }
            else
            {
                Segmenter = segmenter;
            }
            PosSegmenter = new PosSegmenter(Segmenter);

            // Stop words come from resources; the file-based SetStopWords(ConfigManager.StopWordsFile) path is disabled.
            SetFromResources();
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }

            // IDF data also comes from resources; the file-based IdfLoader(DefaultIdfFile) path is disabled.
            Loader = new IdfLoader();
            Loader.LoadFromResources();

            IdfFreq   = Loader.IdfFreq;
            MedianIdf = Loader.MedianIdf;
        }
Example #27
0
    /// <summary>
    /// Removes a leading "verb + uj particle" structure from the text.
    /// </summary>
    /// <param name="OrgString">Text to trim.</param>
    /// <returns>
    /// The concatenated words that follow the first cut point, or <paramref name="OrgString"/>
    /// unchanged when no cut point is found. A cut point is either a token flagged "uj" directly
    /// preceded by a token flagged "v", or the verb "购买" followed by a token not flagged "uj".
    /// </returns>
    string TrimUJWords(string OrgString)
    {
        var tagger = new PosSegmenter();
        var tokens = tagger.Cut(OrgString).ToList();
        var cutIdx = -1;

        for (int i = 0; i < tokens.Count; i++)
        {
            var flag = tokens[i].Flag;

            // Case 1: "uj" token directly after a verb.
            bool ujAfterVerb = flag == "uj" && i >= 1 && tokens[i - 1].Flag == "v";
            // Case 2: the verb "购买" with a following token that is not "uj".
            bool buyNotBeforeUj = !ujAfterVerb &&
                                  flag == "v" && tokens[i].Word.Equals("购买") &&
                                  i + 1 < tokens.Count && tokens[i + 1].Flag != "uj";

            if (ujAfterVerb || buyNotBeforeUj)
            {
                cutIdx = i;
                break;
            }
        }

        if (cutIdx == -1)
        {
            return OrgString;
        }

        // Keep only the words after the cut point.
        return string.Concat(tokens.Skip(cutIdx + 1).Select(token => token.Word));
    }
Example #28
0
        /// <summary>
        /// Looks up a location for every token of <paramref name="title"/> flagged "ns" and
        /// returns the one with the greatest <c>level</c>.
        /// </summary>
        /// <param name="title">Title text to scan.</param>
        /// <returns>
        /// The location with the highest <c>level</c> (the later one in the title on ties),
        /// or a new default <c>LanAndLon</c> when the title has no "ns" token.
        /// </returns>
        public static LanAndLon GetLatAndLonByTitle(string title)
        {
            var candidates = new List<LanAndLon>();
            var tagger     = new PosSegmenter();

            foreach (Pair item in tagger.Cut(title))
            {
                if (item.Flag == "ns")
                {
                    candidates.Add(GetLatAndLonByWord(item.Word));
                }
            }

            if (candidates.Count == 0)
            {
                return new LanAndLon();
            }

            // OrderBy returns a new sequence and leaves the list untouched. The original discarded
            // that result, so Last() returned the last-added entry instead of the highest level.
            return candidates.OrderBy(ll => ll.level).Last();
        }
Example #29
0
    /// <summary>
    /// Analysis of the entities' own properties: feeds contract fields into
    /// <c>EntityWordAnlayzeTool</c> in four groups (Party A, Party B, contract name, project name)
    /// and calls <c>WriteFirstAndLengthWordToLog</c> after each group.
    /// </summary>
    public static void EntityWordPerperty()
    {
        // NOTE(review): this instance is never used in the method. The construction is kept in case
        // it has initialization side effects; confirm and remove if it has none.
        var posSeg = new PosSegmenter();

        // Party A statistics
        Program.Training.WriteLine("甲方统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.JiaFang);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // Party B statistics
        Program.Training.WriteLine("乙方统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.YiFang);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // Contract name statistics
        Program.Training.WriteLine("合同统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ContractName);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();

        // Project name statistics
        Program.Training.WriteLine("工程统计:");
        EntityWordAnlayzeTool.Init();
        foreach (var contract in TraningDataset.ContractList)
        {
            EntityWordAnlayzeTool.PutEntityWordPerperty(contract.ProjectName);
        }
        EntityWordAnlayzeTool.WriteFirstAndLengthWordToLog();
    }
Example #30
0
        /// <summary>
        /// TF-IDF extractor constructor.
        /// </summary>
        /// <param name="segmenter">Segmenter to use; a new <c>Segmenter</c> is created when <c>IsNull()</c> is true.</param>
        public TfidfExtractor(Segmenter segmenter = null)
        {
            Segmenter    = segmenter.IsNull() ? new Segmenter() : segmenter;
            PosSegmenter = new PosSegmenter(Segmenter);

            // Use the dictionary's stop words, topped up with the defaults when that set is empty.
            StopWords = Dict.StopWords;
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }

            // Copy the loader's IDF table and median IDF onto this instance.
            Loader    = new IdfLoader();
            IdfFreq   = Loader.IdfFreq;
            MedianIdf = Loader.MedianIdf;
        }