コード例 #1
0
ファイル: EntitySelf.cs プロジェクト: lxxxjd/liangzhimodel
 /// <summary>
 /// Creates the four factor items and attaches to each the transform that
 /// derives its value from a piece of text.
 /// </summary>
 public void InitFactorItem()
 {
     // Part-of-speech flag of the first segmented token.
     FirstWordPosFactorItem = new FactorItem <String>
     {
         Transform = text => posSeg.Cut(text).First().Flag
     };

     // Number of characters in the text.
     WordLengtFactorItem = new FactorItem <int>
     {
         Transform = text => text.Length
     };

     // Number of tokens the segmenter produces for the text.
     WordCountFactorItem = new FactorItem <int>
     {
         Transform = text => posSeg.Cut(text).Count()
     };

     // Surface form of the last segmented token.
     LastWordFactorItem = new FactorItem <String>
     {
         Transform = text => posSeg.Cut(text).Last().Word
     };
 }
コード例 #2
0
    /// <summary>
    /// Records statistics for one word: increments the counter kept for the
    /// part-of-speech flag of its first segmented token and the counter kept
    /// for its character length.
    /// </summary>
    /// <param name="Word">Word to record; null or empty input is ignored.</param>
    public static void PutWord(string Word)
    {
        if (String.IsNullOrEmpty(Word))
        {
            return;
        }

        // Materialize once so the segmentation result is not enumerated twice.
        var words = posSeg.Cut(Word).ToList();
        if (words.Count == 0)
        {
            return;
        }

        // TryGetValue leaves the count at its default (0) for an unseen key, so one
        // lookup plus an indexer write covers both the insert and the update case.
        var pos = words[0].Flag;
        FirstWordPos.TryGetValue(pos, out var posCount);
        FirstWordPos[pos] = posCount + 1;

        var wl = Word.Length;
        WordLength.TryGetValue(wl, out var lengthCount);
        WordLength[wl] = lengthCount + 1;
    }
コード例 #3
0
        /// <summary>
        /// Segments every line of the input file and writes the tokens of each
        /// line, joined by the configured delimiter, to standard output.
        /// </summary>
        /// <param name="options">
        /// Options read here: <c>FileName</c>, <c>POS</c>, <c>Delimiter</c>,
        /// <c>CutAll</c> and <c>NoHmm</c>.
        /// </param>
        private static void SegmentFile(Options options)
        {
            var fileName = Path.GetFullPath(options.FileName);
            var lines    = File.ReadAllLines(fileName);

            var segmenter = new JiebaSegmenter();
            Func <string, bool, bool, IEnumerable <string> > cutMethod;

            if (options.POS)
            {
                // One tagger serves the whole file; it does not need to be rebuilt per line.
                var posSeg = new PosSegmenter(segmenter);
                cutMethod = (text, cutAll, hmm) =>
                            posSeg.Cut(text, hmm).Select(token => string.Format("{0}/{1}", token.Word, token.Flag));
            }
            else
            {
                cutMethod = segmenter.Cut;
            }

            // A blank delimiter falls back to "/ ".
            var delimiter = string.IsNullOrWhiteSpace(options.Delimiter) ? "/ " : options.Delimiter;

            // The third argument of the cut method is forwarded as "hmm" (use HMM), whereas
            // the option is named NoHmm, so the flag is negated before it is passed on.
            // NOTE(review): assumes NoHmm == true means "disable HMM" - confirm against Options.
            var useHmm = !options.NoHmm;
            var result = lines.Select(line => string.Join(delimiter, cutMethod(line, options.CutAll, useHmm)));

            Console.WriteLine(string.Join(Environment.NewLine, result));
        }
コード例 #4
0
ファイル: TokenStrHelper.cs プロジェクト: guangxb/RpcOverHttp
        /// <summary>
        /// Segments <paramref name="str"/> with the shared segmenter, starting a background
        /// thread that runs <c>InitTokenSegmenter</c> when a first load or a refresh is due.
        /// </summary>
        /// <param name="matchGroups">Match groups; the latest <c>UpdateTime</c> among them is compared with the last write time of the user dictionary file.</param>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// The result of <c>segmenter.Cut(str, true)</c> as a list, or an empty list on the
        /// call that starts the first load thread.
        /// </returns>
        public List <Pair> TokenStr(List <RpcServiceCollection.MatchGroup> matchGroups, string str)
        {
            Trace.WriteLine("TokenStr called from thread " + Thread.CurrentThread.ManagedThreadId);
            // The flag is tested again inside the lock so that only one caller starts the first load thread.
            if (!user_dict_load)
            {
                lock (segmenterLock)
                {
                    if (!user_dict_load && LoadThread == null)
                    {
                        LoadThread = new Thread(InitTokenSegmenter);
                        LoadThread.Start();
                        // The caller that starts the load gets no tokens instead of waiting for the thread.
                        return(new List <Pair>());
                    }
                }
            }
            // NOTE(review): when user_dict_load is still false but LoadThread is already set,
            // execution continues and uses segmenter below - confirm it is usable at that point.
            var last_write_time = File.GetLastWriteTime(user_dict_txt);
            // NOTE(review): Max may throw for an empty matchGroups list - confirm callers never pass one.
            var update_time     = matchGroups.Max(x => x.UpdateTime);

            // Refresh when the dictionary file was last written more than 10 minutes ago,
            // or when a match group is newer than the file.
            if (DateTime.Now.Subtract(last_write_time).TotalMinutes > 10 || update_time > last_write_time)
            {
                lock (segmenterLock)
                {
                    // NOTE(review): updating is not written in this method; presumably
                    // InitTokenSegmenter maintains it - verify.
                    if (!updating)
                    {
                        LoadThread = new Thread(InitTokenSegmenter);
                        LoadThread.Start();
                    }
                }
            }
            // Segmentation does not wait for a refresh thread started above.
            var values = segmenter.Cut(str, true);

            return(values.ToList());
        }
コード例 #5
0
    /// <summary>
    /// Splits a text at conjunctions: the words between two conjunction tokens
    /// are concatenated into one segment and the conjunction tokens themselves
    /// are dropped.
    /// </summary>
    /// <param name="OrgString">Text to split.</param>
    /// <returns>The non-empty segments, in their original order.</returns>
    public static List <String> CutByPOSConection(string OrgString)
    {
        var pos     = new PosSegmenter();
        var rtn     = new List <String>();
        // A StringBuilder avoids re-allocating the growing segment for every token.
        var current = new System.Text.StringBuilder();

        foreach (var item in pos.Cut(OrgString))
        {
            if (item.Flag != LTPTrainingNER.连词)
            {
                current.Append(item.Word);
                continue;
            }

            // A conjunction closes the current segment; empty segments are not emitted.
            if (current.Length > 0)
            {
                rtn.Add(current.ToString());
                current.Clear();
            }
        }

        // Emit whatever follows the last conjunction.
        if (current.Length > 0)
        {
            rtn.Add(current.ToString());
        }
        return(rtn);
    }
コード例 #6
0
ファイル: Program.cs プロジェクト: loooo139/gradute
        /// <summary>
        /// Quick test area: scratch entry point for ad-hoc experiments.
        /// Only the statements before the first <c>return</c> are executed.
        /// </summary>
        private static void QuickTestArea()
        {
            // Active experiment: load the paragraph list of one NER XML file and run
            // the NER-based company name lookup on it.
            var plst = LTPTrainingNER.GetParagraghList(StockChangePath_TEST + "/ner/18877033.xml");

            CompanyNameLogic.GetCompanyNameByNerInfo(plst);
            return;

            // Everything below the return above is unreachable; it is the remainder of
            // earlier experiments.
            var s0    = "爱康科技向爱康实业、爱康国际、苏州度金、天地国际、钨业研究支付现金购买其合计持有爱康光电100%股权";
            var pos   = new PosSegmenter();
            var words = pos.Cut(s0);

            // NOTE(review): neither writer is closed in this method (the Close calls below are commented out).
            Evaluator = new StreamWriter("Evaluator.log");
            Score     = new StreamWriter("Result" + Path.DirectorySeparatorChar + "Score" + Path.DirectorySeparatorChar + "score" + System.DateTime.Now.ToString("yyyyMMddHHmmss") + ".txt");
            //Evaluate.EvaluateReorganizationByFile(@"E:\WorkSpace2018\FDDC2018\FDDC_SRC\Result\chongzu_train.txt");
            //Score.Close();
            //Evaluator.Close();

            //TraningDataset.InitReorganization();
            // Evaluation method names assigned to the reorganization training data.
            ReOrganizationTraning.EvaluateMethodList = new string[] {
                "收益法", "资产基础法", "市场法", "市场比较法", "估值法", "成本法", "现金流折现法", "现金流折现法", "剩余法",
                "内含价值调整法", "可比公司市净率法", "重置成本法", "收益现值法", "基础资产法", "假设清偿法",
                "成本逼近法", "单项资产加和法", "成本加和法", "基准地价修正法", "收益还原法", "现金流量法", "单项资产加总法", "折现现金流量法", "基准地价系数修正法"
            }.ToList();
            // Extract the records of a single reorganization announcement.
            var t = new Reorganization();

            t.Id           = "748379";
            t.HTMLFileName = ReorganizationPath_TEST + "/html/1759374.html";
            //t.TextFileName = ContractPath_TEST + "/txt/128869.txt";
            //t.NerXMLFileName = ContractPath_TEST + "/ner/128869.xml";
            t.Init();
            var recs = t.Extract();
            var s1   = recs[0].ConvertToString();
        }
コード例 #7
0
    /// <summary>
    /// Scratch routine for word analysis: segments a sample company name with a
    /// segmenter that has two extra words registered, cleans the name up, and then
    /// prints the word/flag pairs a default tagger produces for a short test string.
    /// </summary>
    public static void RunWordAnlayze()
    {
        var companyName = "华陆工程(科技)有限责任公司";

        var jieba = new JiebaSegmenter();
        jieba.AddWord("华陆工程科技有限责任公司");
        jieba.AddWord("中煤陕西榆林能源化工有限公司");

        // Neither the segmentation result nor the cleaned-up name is used afterwards.
        var customTagger = new PosSegmenter(jieba);
        var nameTokens   = customTagger.Cut(companyName);
        companyName = companyName.NormalizeTextResult();
        companyName = RegularTool.TrimBrackets(companyName);

        // Print "word:flag" for every token of the test string.
        var defaultTagger = new JiebaNet.Segmenter.PosSeg.PosSegmenter();
        foreach (var token in defaultTagger.Cut("承运市"))
        {
            Console.WriteLine(string.Format("{0}:{1}", token.Word, token.Flag));
        }
    }
コード例 #8
0
        /// <summary>
        /// Segments a sentence with a new default <c>PosSegmenter</c> and returns
        /// the resulting pairs as a list.
        /// </summary>
        /// <param name="sentance">Text to segment.</param>
        /// <returns>The pairs produced by the segmenter, in order.</returns>
        public static List <Pair> CutWord(String sentance)
        {
            var tagger = new PosSegmenter();

            return(new List <Pair>(tagger.Cut(sentance)));
        }
コード例 #9
0
ファイル: WordCut.cs プロジェクト: schifflee/NovelAnalysis
        /// <summary>
        /// Worker-thread routine that segments every sentence of one paragraph and
        /// appends one <c>Sentence</c> per input sentence to <c>dc.sentences</c>.
        /// </summary>
        /// <param name="nowNum">Index of the paragraph inside <c>dc.preResult</c>.</param>
        private void workCutParagraph(int nowNum)
        {
            PosSegmenter segmenter = new PosSegmenter();

            // Progress message: paragraph index, paragraph total and percentage (two decimals).
            print(string.Format("正在对第{0}段分词(共{1}段,{2}%)",
                                nowNum,
                                dc.preResult.Count,
                                Math.Round((double)nowNum * 100.0 / dc.preResult.Count,
                                           2)
                                ));
            try
            {
                for (int j = 0; j < dc.preResult[nowNum].Count; j++)
                {
                    try
                    {
                        // Segment a single sentence.
                        var words = segmenter.Cut(dc.preResult[nowNum][j]);
                        // Tag the result with its paragraph and sentence index.
                        Sentence s = new Sentence(nowNum, j, words);

                        dc.sentences.Add(s);
                    }
                    catch (Exception sentenceError)
                    {
                        // Report the failure instead of swallowing it silently; an empty
                        // sentence is still recorded for this position.
                        print(string.Format("第{0}段第{1}句分词失败:{2}", nowNum, j, sentenceError.Message));
                        Sentence s = new Sentence(nowNum, j, new List <Pair>());
                        dc.sentences.Add(s);
                    }
                }
            }
            catch (Exception ex)
            {
                print("分词失败:" + ex.Message);
            }
        }
コード例 #10
0
        /// <summary>
        /// Segments <paramref name="text"/> with a default <c>PosSegmenter</c> and prints
        /// the tokens as space-separated "word/flag" items on one line.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        private static void TestPosSegmenterCut(string text)
        {
            var labelled = new PosSegmenter()
                           .Cut(text)
                           .Select(pair => string.Format("{0}/{1}", pair.Word, pair.Flag));

            Console.WriteLine(string.Join(" ", labelled));
        }
コード例 #11
0
        /// <summary>
        /// Page load handler: segments a fixed sample sentence and writes every token
        /// to the response as "word/flag" followed by an HTML line break.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            var posSeg = new PosSegmenter();
            var s      = "就算你留恋开放在水中娇艳的水仙,别忘了寂寞的山谷里角落里野百合也有春天";
            var tokens = posSeg.Cut(s);

            // "<br/>" is the line-break element; "</br>" is a stray end tag and not valid markup.
            Response.Write(string.Join(" ", tokens.Select(token => string.Format("{0}/{1}<br/>", token.Word, token.Flag))));
        }
コード例 #12
0
        /// <summary>
        /// Segments a fixed sentence containing place names and prints the tokens
        /// as space-separated "word/flag" items.
        /// </summary>
        public void TestCutNames()
        {
            var tagger   = new PosSegmenter();
            var labelled = tagger.Cut("吉林的省会是长春")
                           .Select(pair => pair.Word + "/" + pair.Flag);

            Console.WriteLine(string.Join(" ", labelled));
        }
コード例 #13
0
        /// <summary>
        /// Segments <paramref name="text"/> with a <c>PosSegmenter</c> built on a new
        /// <c>JiebaSegmenter</c> and returns the resulting pairs.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <returns>The pairs produced by the segmenter.</returns>
        public IEnumerable <Pair> jieba(string text)
        {
            // Example request:
            // http://localhost:5000/ef/jieba?text=%E4%BD%A0%E5%A5%BD%E8%BF%99%E6%98%AF%E9%94%99%E8%AF%AF%E7%9A%84
            var tagger = new PosSegmenter(new JiebaSegmenter());

            return(tagger.Cut(text));
        }
コード例 #14
0
ファイル: TestDemo.cs プロジェクト: zyj0021/jieba.NET
        /// <summary>
        /// Demo: segments a fixed sample sentence with POS tagging and prints the
        /// tokens as space-separated "word/flag" items.
        /// </summary>
        public void PosCutDemo()
        {
            const string sample = "一团硕大无朋的高能离子云,在遥远而神秘的太空中迅疾地飘移";

            var labelled = new PosSegmenter()
                           .Cut(sample)
                           .Select(pair => pair.Word + "/" + pair.Flag);

            Console.WriteLine(string.Join(" ", labelled));
        }
コード例 #15
0
        /// <summary>
        /// Segments a text with part-of-speech tagging and converts every result
        /// into a <c>Model.Pair</c>.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <returns>One model pair (word, flag) per token, in segmentation order.</returns>
        public static List <MicroBlogCalendar.Model.Pair> Cut(string text)
        {
            // Project in a single pass; counting first and iterating afterwards
            // would enumerate the segmentation result twice.
            return(posSeg.Cut(text)
                   .Select(item => new Model.Pair(item.Word, item.Flag))
                   .ToList());
        }
コード例 #16
0
ファイル: JiebaTagger.cs プロジェクト: Spongebob5/_
        /// <summary>
        /// Tags the words of a sentence: segments <c>sentence.Text</c> and copies the
        /// flag of the i-th token into both <c>Pos</c> and <c>Tag</c> of the i-th word.
        /// </summary>
        /// <param name="sentence">Sentence whose <c>Words</c> are updated in place.</param>
        /// <param name="options">Tagging options; not read by this implementation.</param>
        public void Tag(Sentence sentence, TagOptions options)
        {
            Init();

            var tokens = posSeg.Cut(sentence.Text).ToList();

            // The tokens come from segmenting sentence.Text, not from sentence.Words, so the
            // two counts need not agree; stop at the shorter one rather than index past
            // the end of tokens. Words beyond that point keep their current values.
            var count = System.Math.Min(sentence.Words.Count, tokens.Count);

            for (int i = 0; i < count; i++)
            {
                var flag = tokens[i].Flag;
                sentence.Words[i].Pos = flag;
                sentence.Words[i].Tag = flag;
            }
        }
コード例 #17
0
        /// <summary>
        /// Runs named entity tagging for one news item: resets the per-news state,
        /// loads the news row, splits its content into sections on spaces, records
        /// every non-empty section and passes its "word/flag" token string to
        /// <c>splitEntity</c>.
        /// </summary>
        /// <param name="newsId">Id of the news row; negative ids are ignored.</param>
        public void doNamedEntityRecognitionByCRF(int newsId)
        {
            if (newsId < 0)
            {
                return;
            }
            this.newsId = newsId;
            this.sectionList.Clear();
            this.entityMentionList.Clear();
            this.entityMap.Clear();
            // Clear the text panel.
            this.richTextBox1.Text = "";
            News news = DBHelper.db.Queryable <News>().InSingle(newsId);

            // Nothing to tag when the lookup yields no row or the row has no content;
            // without this guard the Split call below would throw.
            if (news == null || news.content == null)
            {
                return;
            }

            // Split the news content into sections.
            string[] sections = news.content.Split(' ');
            int      secIndex = 0;

            foreach (string sectionValue in sections)
            {
                if (sectionValue == "")
                {
                    continue;
                }
                // Materialize once: the tokens are formatted and counted below.
                var tokens = posSeg.Cut(sectionValue).ToList();

                // Keep the section in sectionList so it can be stored later.
                var section = new Section();
                section.newsId      = news.id;
                section.indexInNews = secIndex;
                section.value       = sectionValue;
                sectionList.Add(section);

                // Recognize named entities in the "word/flag" representation.
                string content = string.Join(" ", tokens.Select(token => string.Format("{0}/{1}", token.Word, token.Flag)));
                splitEntity(secIndex, content);

                // Token offset at which the next section starts within the news.
                secIndex += tokens.Count;
            }
        }
コード例 #18
0
        /// <summary>
        /// Verifies that a word added to the segmenter is returned as a single token
        /// with the given tag by a <c>PosSegmenter</c> built on that segmenter.
        /// </summary>
        public void TestAddWord()
        {
            const string sentence = "小明最近在学习自然语言处理";
            const string userWord = "自然语言处理";

            var seg    = new JiebaSegmenter();
            var posSeg = new PosSegmenter(seg);

            // Before the word is added, the sentence ends with the shorter token.
            var tokens = posSeg.Cut(sentence).ToList();
            Console.WriteLine(string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}")));
            Assert.That(tokens.Last().Word, Is.EqualTo("处理"));

            seg.AddWord(userWord, tag: "n");
            try
            {
                tokens = posSeg.Cut(sentence).ToList();
                Console.WriteLine(string.Join(" ", tokens.Select(token => $"{token.Word}/{token.Flag}")));

                var lastToken = tokens.Last();
                Assert.That(lastToken.Word, Is.EqualTo(userWord));
                Assert.That(lastToken.Flag, Is.EqualTo("n"));
            }
            finally
            {
                // Remove the word even when an assertion above fails, so a failing run
                // does not leave it registered.
                seg.DeleteWord(userWord);
            }
        }
コード例 #19
0
ファイル: Parser.cs プロジェクト: Superplasma/GuapiGraph
        // The "adj" words the teaching assistant refers to (e.g. "精通", "熟悉") are in fact
        // verbs, so the tokens flagged "v" are what gets extracted here.
        private static List <string> getAdjs(string quals)
        {
            var tagger = new PosSegmenter();
            var verbs  = new List <Pair>(tagger.Cut(quals)).FindAll(token => token.Flag == "v");

            return(verbs.ConvertAll(token => token.Word));
        }
コード例 #20
0
        /// <summary>
        /// Builds a co-occurrence graph over the words of <paramref name="text"/> that pass
        /// the part-of-speech filter and returns the ranking computed by the graph.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <param name="allowPos">Allowed part-of-speech flags; when empty, <c>DefaultPosFilter</c> is used.</param>
        /// <returns>The result of <c>UndirectWeightedGraph.Rank()</c> for the co-occurrence graph.</returns>
        private IDictionary <string, double> ExtractTagRank(string text, IEnumerable <string> allowPos)
        {
            if (allowPos.IsEmpty())
            {
                allowPos = DefaultPosFilter;
            }

            var g     = new UndirectWeightedGraph();
            // Keyed by the word pair itself. A "first$second" string key that is split on
            // '$' afterwards mangles any word containing '$' and can merge distinct pairs.
            var cm    = new Dictionary <System.Tuple <string, string>, int>();
            var words = PosSegmenter.Cut(text).ToList();

            for (var i = 0; i < words.Count; i++)
            {
                var wp = words[i];
                if (!PairFilter(allowPos, wp))
                {
                    continue;
                }

                // Pair the word with every allowed word among the following tokens
                // that lie before index i + Span.
                for (var j = i + 1; j < i + Span && j < words.Count; j++)
                {
                    if (!PairFilter(allowPos, words[j]))
                    {
                        continue;
                    }

                    var key = System.Tuple.Create(wp.Word, words[j].Word);
                    int seen;
                    cm.TryGetValue(key, out seen);   // seen stays 0 for a new pair
                    cm[key] = seen + 1;
                }
            }

            // Every counted pair becomes an edge weighted by its co-occurrence count.
            foreach (var p in cm)
            {
                g.AddEdge(p.Key.Item1, p.Key.Item2, p.Value);
            }

            return(g.Rank());
        }
コード例 #21
0
ファイル: Reorganization.cs プロジェクト: loooo139/gradute
    /// <summary>
    /// Drops the leading part of a phrase up to and including a cut token. The cut
    /// token is the first token that is either flagged "uj" directly after a token
    /// flagged "v", or the verb "购买" followed by a token that is not flagged "uj".
    /// </summary>
    /// <param name="OrgString">Phrase to trim.</param>
    /// <returns>
    /// The concatenated words after the cut token, or <paramref name="OrgString"/>
    /// unchanged when no cut token is found.
    /// </returns>
    string TrimUJWords(string OrgString)
    {
        var tagger = new PosSegmenter();
        var tokens = tagger.Cut(OrgString).ToList();
        var cutAt  = -1;

        for (int k = 0; k < tokens.Count; k++)
        {
            if ((tokens[k].Flag == "uj" && k >= 1 && tokens[k - 1].Flag == "v") ||
                (tokens[k].Flag == "v" && tokens[k].Word.Equals("购买") && k + 1 < tokens.Count && tokens[k + 1].Flag != "uj"))
            {
                cutAt = k;
                break;
            }
        }

        if (cutAt == -1)
        {
            return(OrgString);
        }

        // Everything after the cut token, joined without separators.
        return(string.Concat(tokens.Skip(cutAt + 1).Select(token => token.Word)));
    }
コード例 #22
0
ファイル: GetLanAndLon.cs プロジェクト: yanwen0614/PM
        /// <summary>
        /// Looks up a location for every token of the title that is flagged "ns" and
        /// returns the one with the highest <c>level</c>.
        /// </summary>
        /// <param name="title">Title text to segment.</param>
        /// <returns>
        /// The looked-up location that sorts last by <c>level</c>, or a new
        /// <c>LanAndLon</c> when the title contains no "ns" token.
        /// </returns>
        public static LanAndLon GetLatAndLonByTitle(string title)
        {
            List <LanAndLon>   lll      = new List <LanAndLon>();
            PosSegmenter       PosSeg   = new PosSegmenter();
            IEnumerable <Pair> res_pair = PosSeg.Cut(title);

            foreach (Pair item in res_pair)
            {
                if (item.Flag == "ns")
                {
                    lll.Add(GetLatAndLonByWord(item.Word));
                }
            }
            if (lll.Count == 0)
            {
                return(new LanAndLon());
            }

            // OrderBy returns a new ordered sequence and leaves the list untouched, so the
            // ordering only takes effect when its result is the thing being read.
            return(lll.OrderBy(ll => ll.level).Last());
        }
コード例 #23
0
ファイル: BussinessLogic.cs プロジェクト: kimmow/FDDC
    //词法分析

    /// <summary>
    /// Collects project name candidates from a document: for every token equal to
    /// "标段", "工程" or "项目", the words from the nearest preceding token flagged
    /// "ns" up to and including that keyword are concatenated into one candidate.
    /// </summary>
    /// <param name="root">Document root whose children are paragraphs containing sentences.</param>
    /// <returns>All candidates found, in document order.</returns>
    public static List <String> GetProjectName(HTMLEngine.MyRootHtmlNode root)
    {
        var posSeg   = new PosSegmenter();
        var namelist = new List <String>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                // A sentence without text has nothing to segment.
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var words = posSeg.Cut(sentence.Content).ToList();
                for (int baseInd = 0; baseInd < words.Count; baseInd++)
                {
                    var keyword = words[baseInd].Word;
                    if (keyword != "标段" && keyword != "工程" && keyword != "项目")
                    {
                        continue;
                    }

                    // Walk backwards to the nearest token flagged "ns".
                    for (int NRIdx = baseInd; NRIdx > -1; NRIdx--)
                    {
                        if (words[NRIdx].Flag != "ns")
                        {
                            continue;
                        }
                        // Join the words from that token through the keyword.
                        namelist.Add(string.Concat(words.Skip(NRIdx)
                                                   .Take(baseInd - NRIdx + 1)
                                                   .Select(w => w.Word)));
                        break;  // stop at the first match
                    }
                }
            }
        }
        return(namelist);
    }
コード例 #24
0
 /// <summary>
 /// Picks the candidate that most resembles a company name: the first candidate
 /// whose first token carries the place-name flag and which ends with "公司" or
 /// contains "有限合伙".
 /// </summary>
 /// <param name="CandidateWords">Candidates to inspect; null and empty entries are skipped.</param>
 /// <returns>
 /// The matching candidate; otherwise the first candidate, or an empty string
 /// when the list is empty.
 /// </returns>
 public static string MostLikeCompanyName(List <string> CandidateWords)
 {
     PosSegmenter posSeg = null;

     foreach (var word in CandidateWords)
     {
         if (string.IsNullOrEmpty(word))
         {
             continue;
         }
         // Created on first use and then shared by all candidates.
         if (posSeg == null)
         {
             posSeg = new PosSegmenter();
         }
         var cuts = posSeg.Cut(word).ToList();
         // Guard the index access below in case segmentation yields no tokens.
         if (cuts.Count == 0)
         {
             continue;
         }
         if (cuts[0].Flag == LTPTrainingNER.地名 &&
             (word.EndsWith("公司") || word.Contains("有限合伙")))
         {
             return(word);
         }
     }
     if (CandidateWords.Count == 0)
     {
         return(String.Empty);
     }
     return(CandidateWords[0]);
 }
コード例 #25
0
        /// <summary>
        /// Segments a string with the selected tool after converting it to simplified
        /// Chinese and removing blanks, and wraps every result in a <c>WordPair</c>.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <param name="tool">Segmentation tool; defaults to jieba.</param>
        /// <returns>
        /// The wrapped segmentation results; an empty list for an unhandled tool value
        /// or when any step throws.
        /// </returns>
        public static List <WordPair> cut(string str, CutTool tool = CutTool.jieba)
        {
            try
            {
                var normalized = removeBlanks(ChineseStringUtility.ToSimplified(str));

                List <Pair> pairs;
                if (tool == CutTool.jieba)
                {
                    // jieba segmentation
                    pairs = new PosSegmenter().Cut(normalized).ToList();
                }
                else if (tool == CutTool.nlpir)
                {
                    // NLPIR-ICTCLAS segmentation
                    pairs = cutByICTCLAS(normalized);
                }
                else
                {
                    pairs = new List <Pair>();
                }

                var wrapped = new List <WordPair>();
                foreach (var pair in pairs)
                {
                    wrapped.Add(new WordPair(pair));
                }
                return(wrapped);
            }
            catch
            {
                // Best effort: any failure yields an empty result.
                return(new List <WordPair>());
            }
        }
コード例 #26
0
        /// <summary>
        /// Click handler: reads the file named by <c>label3</c>, selects up to 100 keyword
        /// nouns from it, builds one unit-length recency vector per line into <c>vec</c>,
        /// and then invokes <c>button1_Click</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            lines = File.ReadAllLines(label3.Text);

            var dic = new Dictionary <string, int>();
            int all = lines.Length;
            int cnt = 0;

            posSeg = new PosSegmenter();

            // Pass 1 (progress 0-50): count how often each token flagged "n" occurs.
            foreach (var l in lines)
            {
                cnt++;
                progressBar1.Value = 50 * cnt / all;
                foreach (var p in posSeg.Cut(l))
                {
                    if (p.Flag != "n")
                    {
                        continue;
                    }
                    // Single lookup: an unseen word leaves the count at 0.
                    int seen;
                    dic.TryGetValue(p.Word, out seen);
                    dic[p.Word] = seen + 1;
                }
            }

            // Order the (count, word) entries with KeyCompare and reverse the result.
            var arr = new ArrayList();

            foreach (var p in dic)
            {
                arr.Add(new KeyValuePair <int, string>(p.Value, p.Key));
            }

            arr.Sort(new KeyCompare());
            arr.Reverse();

            // Take the first 100 entries, or all of them when fewer exist; a fixed bound
            // of 100 would index past the end of arr for small inputs.
            keyWords = new Dictionary <string, int>();
            int top = Math.Min(100, arr.Count);
            for (int i = 0; i < top; i++)
            {
                var entry = (KeyValuePair <int, string>)arr[i];
                keyWords[entry.Value] = 1;
            }

            vec = new ArrayList();
            // last[word] holds the index of the most recent line containing the keyword;
            // -1000 stands for "not seen yet".
            var last = new Dictionary <string, int>();

            foreach (var p in keyWords)
            {
                last[p.Key] = -1000;
            }

            // Pass 2 (progress 50-100): one vector per line.
            for (int i = 0; i < lines.Length; i++)
            {
                progressBar1.Value = 50 + 50 * (i + 1) / all;
                foreach (var p in posSeg.Cut(lines[i]))
                {
                    if (keyWords.ContainsKey(p.Word))
                    {
                        last[p.Word] = i;
                    }
                }

                // Component = 1 / (lines since the keyword was last seen + 1).
                double   s = 0;
                double[] v = new double[keyWords.Count];
                int      j = 0;
                foreach (var p in keyWords)
                {
                    double t = 1.0 / (i - last[p.Key] + 1);
                    s   += t * t;
                    v[j] = t;
                    j++;
                }

                // Scale the vector to unit length.
                s = Math.Sqrt(s);
                for (int k = 0; k < v.Length; k++)
                {
                    v[k] /= s;
                }
                vec.Add(v);
            }
            button1_Click(null, null);
        }
コード例 #27
0
    public static List <struCompanyName> GetCompanyNameByCutWord(HTMLEngine.MyRootHtmlNode root)
    {
        var posSeg   = new PosSegmenter();
        var namelist = new List <struCompanyName>();

        foreach (var paragrah in root.Children)
        {
            foreach (var sentence in paragrah.Children)
            {
                if (string.IsNullOrEmpty(sentence.Content))
                {
                    continue;
                }
                var words         = posSeg.Cut(sentence.Content).ToList();
                var PreviewEndIdx = -1;
                for (int baseInd = 0; baseInd < words.Count; baseInd++)
                {
                    var FullName     = String.Empty;
                    var ShortName    = String.Empty;
                    var IsSubCompany = false;
                    if (words[baseInd].Word == "国家电网" &&
                        (baseInd + 1) < words.Count &&
                        words[baseInd + 1].Word == "公司")
                    {
                        namelist.Add(new struCompanyName()
                        {
                            secFullName = "国家电网公司",
                            positionId  = sentence.PositionId,
                            WordIdx     = baseInd,
                            Score       = 100
                        });
                        continue;
                    }
                    if (
                        words[baseInd].Word == "有限公司" ||
                        (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "有限责任") ||
                        (words[baseInd].Word == "公司" && baseInd != 0 && words[baseInd - 1].Word == "承包") ||
                        (words[baseInd].Word == "有限" && baseInd != words.Count - 1 && words[baseInd + 1].Word == "合伙")
                        )
                    {
                        //是否能够在后面找到简称
                        for (int JCIdx = baseInd; JCIdx < words.Count; JCIdx++)
                        {
                            //简称关键字
                            if (words[JCIdx].Word.Equals("简称") || words[JCIdx].Word.Equals("称"))
                            {
                                var ShortNameStart = -1;
                                var ShortNameEnd   = -1;
                                for (int ShortNameIdx = JCIdx; ShortNameIdx < words.Count; ShortNameIdx++)
                                {
                                    if (words[ShortNameIdx].Word.Equals("“"))
                                    {
                                        ShortNameStart = ShortNameIdx + 1;
                                    }
                                    if (words[ShortNameIdx].Word.Equals("”"))
                                    {
                                        ShortNameEnd = ShortNameIdx - 1;
                                        break;
                                    }
                                }
                                if (ShortNameStart != -1 && ShortNameEnd != -1)
                                {
                                    ShortName = String.Empty;
                                    for (int i = ShortNameStart; i <= ShortNameEnd; i++)
                                    {
                                        ShortName += words[i].Word;
                                    }
                                }
                                break;
                            }
                        }

                        var FirstShortNameWord = String.Empty;
                        if (ShortName.Length == 4)
                        {
                            FirstShortNameWord = ShortName.Substring(0, 2);
                        }
                        var IsMarkClosed      = true;
                        var CompanyStartIdx   = -1;
                        var FirstShortNameIdx = -1; //包含简称的位置
                        //是否能够在前面找到地名
                        for (int NRIdx = baseInd; NRIdx > PreviewEndIdx; NRIdx--)
                        {
                            if (words[NRIdx].Word == FirstShortNameWord)
                            {
                                FirstShortNameIdx = NRIdx;   //备用
                            }
                            //寻找地名?words[NRIdx].Flag == EntityWordAnlayzeTool.机构团体
                            //posSeg.Cut(words[NRIdx].Word + "市").First().Flag == EntityWordAnlayzeTool.地名
                            if (words[NRIdx].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx].Word))
                            {
                                //注意,地名可能相连,例如:上海市嘉定
                                if (NRIdx != 0 && (words[NRIdx - 1].Flag == LTPTrainingNER.地名 || PosNS.NsDict.Contains(words[NRIdx - 1].Word)))
                                {
                                    continue;
                                }
                                FullName = String.Empty;
                                for (int companyFullNameInd = NRIdx; companyFullNameInd <= baseInd; companyFullNameInd++)
                                {
                                    FullName += words[companyFullNameInd].Word;
                                }
                                //(有限合伙)
                                if (words[baseInd].Word == "有限")
                                {
                                    FullName += words[baseInd + 1].Word;
                                    if ((baseInd + 2) < words.Count)
                                    {
                                        FullName += words[baseInd + 2].Word;
                                    }
                                }
                                //子公司判断
                                if (NRIdx != 0 && words[NRIdx - 1].Word == "子公司")
                                {
                                    IsSubCompany = true;
                                }
                                if (IsMarkClosed)
                                {
                                    //皆大欢喜的局面
                                    CompanyStartIdx = NRIdx;
                                    PreviewEndIdx   = baseInd;
                                    break;  //不要继续寻找地名了
                                }
                            }
                            if (words[NRIdx].Flag == LTPTrainingNER.词性标点)
                            {
                                if (words[NRIdx].Word != "(" && words[NRIdx].Word != ")")
                                {
                                    break;
                                }
                                if (words[NRIdx].Word == ")")
                                {
                                    IsMarkClosed = false;                              //打开
                                }
                                if (words[NRIdx].Word == "(")
                                {
                                    IsMarkClosed = true;                               //关闭
                                }
                            }
                        }

                        if (CompanyStartIdx == -1)
                        {
                            if (FirstShortNameIdx == -1)
                            {
                                continue;
                            }
                            if (posSeg.Cut(ShortName).First().Flag == LTPTrainingNER.地名)
                            {
                                continue;
                            }
                            FullName = String.Empty;
                            for (int NRIdx = FirstShortNameIdx; NRIdx <= baseInd; NRIdx++)
                            {
                                FullName += words[NRIdx].Word;
                            }
                            //(有限合伙)
                            if (words[baseInd].Word == "有限")
                            {
                                FullName += words[baseInd + 1].Word;
                                FullName += words[baseInd + 2].Word;
                            }
                            //子公司判断
                            if (FirstShortNameIdx != 0 && words[FirstShortNameIdx - 1].Word == "子公司")
                            {
                                IsSubCompany = true;
                            }
                        }

                        if (FullName != String.Empty)
                        {
                            FullName  = FullName.Replace(" ", String.Empty).Trim();
                            ShortName = ShortName.Replace(" ", String.Empty).Trim();
                            if (ShortName == "公司" || ShortName == "本公司")
                            {
                                ShortName = String.Empty;
                            }
                            if (ShortName == String.Empty)
                            {
                                var json = GetCompanyNameByFullName(FullName);
                                ShortName = json.secShortName;
                            }
                            namelist.Add(new struCompanyName()
                            {
                                secFullName  = FullName,
                                secShortName = ShortName,
                                isSubCompany = IsSubCompany,
                                positionId   = sentence.PositionId,
                                WordIdx      = CompanyStartIdx,
                                Score        = 100
                            });
                        }
                    }
                }
            }
        }
        return(namelist);
    }
コード例 #28
0
        /// <summary>
        /// Runs part-of-speech segmentation over every sentence and writes the result to a file,
        /// one line per character, formatted as "character TAB flag TAB label". The label is a
        /// B-/I- prefixed entity tag for recognised entity flags and "O" for everything else.
        /// A single empty line is written after all sentences.
        /// </summary>
        /// <param name="sentences">Sentences to segment and tag.</param>
        /// <param name="path">Path of the output file; it is handed to File.CreateText.</param>
        public static void PosSegmentWriteToFile(List <string> sentences, string path)
        {
            JiebaSegmenter jbSeg  = new JiebaSegmenter();
            PosSegmenter   posSeg = new PosSegmenter(jbSeg);

            // The using block releases the file handle even when segmentation or a write throws;
            // the previous explicit Close() was only reached on the success path.
            using (StreamWriter writer = File.CreateText(path))
            {
                foreach (var sentence in sentences)
                {
                    IEnumerable <Pair> wordList = posSeg.Cut(sentence);
                    foreach (var item in wordList)
                    {
                        // Map the POS flag to an entity name; null means "not an entity".
                        string entity;
                        // nr person name | nr1 Chinese surname | nr2 Chinese given name
                        // | nrj Japanese person name | nrf transliterated person name
                        if (item.Flag.StartsWith("nr"))
                        {
                            entity = "Person";
                        }
                        else if (item.Flag.Equals("ns"))      // place name
                        {
                            entity = "Location";
                        }
                        else if (item.Flag.Equals("nt"))      // organization name
                        {
                            entity = "Organization";
                        }
                        else if (item.Flag.Equals("t"))       // time
                        {
                            entity = "Time";
                        }
                        else if (item.Flag.Equals("v"))       // verb
                        {
                            entity = "Event";
                        }
                        else                                  // non-entity
                        {
                            entity = null;
                        }

                        // One output line per character: the first character of an entity gets
                        // the "B-" prefix, the rest "I-". An empty word writes nothing (the
                        // earlier code indexed Word[0] unconditionally for entity flags).
                        for (int i = 0; i < item.Word.Length; i++)
                        {
                            string label;
                            if (entity == null)
                            {
                                label = "O";
                            }
                            else
                            {
                                label = (i == 0 ? "B-" : "I-") + entity;
                            }
                            writer.WriteLine(item.Word[i] + "\t" + item.Flag + "\t" + label);
                        }
                    }
                }
                writer.WriteLine();
            }
        }
コード例 #29
0
 /// <summary>
 /// Segments <paramref name="text"/> with PosSegmenter.Cut and returns the words whose
 /// part-of-speech flag is contained in <paramref name="allowPos"/>.
 /// </summary>
 /// <param name="text">Text to segment.</param>
 /// <param name="allowPos">Part-of-speech flags to keep.</param>
 /// <returns>The Word of every segmented pair whose Flag is in allowPos.</returns>
 private IEnumerable <string> FilterCutByPos(string text, IEnumerable <string> allowPos)
 {
     return from pair in PosSegmenter.Cut(text)
            where allowPos.Contains(pair.Flag)
            select pair.Word;
 }
コード例 #30
0
ファイル: TfidfExtractor.cs プロジェクト: zhangkangen/Learn
        /// <summary>
        /// Segments <paramref name="text"/> with PosSegmenter.Cut and keeps only the words
        /// whose part-of-speech flag appears in <paramref name="allowPos"/>.
        /// </summary>
        /// <param name="text">Text to segment.</param>
        /// <param name="allowPos">Part-of-speech flags to keep.</param>
        /// <returns>The Word of every segmented pair whose Flag is in allowPos.</returns>
        private IEnumerable <string> FilterCutByPos(string text, IEnumerable <string> allowPos)
        {
            return PosSegmenter.Cut(text)
                   .Where(token => allowPos.Contains(token.Flag))
                   .Select(token => token.Word);
        }