Exemple #1
0
        /// <summary>
        /// Builds a dependency-annotated sentence from <paramref name="terms"/>: the terms are
        /// laid out in a tagging table, tagged by the CRF model, and each resulting word is then
        /// linked to its head and given a dependency relation.
        /// </summary>
        /// <param name="terms">Terms to parse; each term contributes one row to the tagging table.</param>
        /// <returns>A <see cref="CoNLLSentence"/> wrapping one <see cref="CoNLLWord"/> per tagged row.</returns>
        public override CoNLLSentence Parse(List <Term> terms)
        {
            // Row layout: [0] word, [1] first character of the compiled POS, [2] compiled POS,
            // [3] left unset here and read back after tagging as the dependency tag.
            var table = new Table();
            table.v = new string[terms.Count][];
            for (int i = 0; i < terms.Count; i++)
            {
                var term = terms[i];
                var pos  = DependencyUtil.compilePOS(term.nature);
                table.v[i] = new string[4];
                table.v[i][0] = term.word;
                table.v[i][1] = pos.Substring(0, 1);
                table.v[i][2] = pos;
            }
            _crfModel.Tag(table);

            // All words must exist before heads are linked, because a head may sit later in the sentence.
            var words = new CoNLLWord[table.Size];
            for (int i = 0; i < words.Length; i++)
            {
                var row = table.v[i];
                words[i] = new CoNLLWord(i + 1, row[0], row[2], row[1]);
            }

            for (int i = 0; i < table.Size; i++)
            {
                var dtag = new DTag(table.v[i][3]);

                // The tag is machine-generated text, so compare ordinally rather than by current culture.
                if (dtag.pos.EndsWith("ROOT", System.StringComparison.Ordinal))
                {
                    words[i].HEAD = CoNLLWord.ROOT;
                    continue;
                }

                // -1 means the tag's offset could not be resolved to a row in this table.
                var index = ConvertOffset2Index(dtag, table, i);
                words[i].HEAD = index == -1 ? CoNLLWord.NULL : words[index];
            }

            // Relations are looked up only after every head is assigned.
            for (int i = 0; i < words.Length; i++)
            {
                var word = words[i];
                word.DEPREL = BigramDependencyModel.Get(word.NAME, word.POSTAG, word.HEAD.NAME, word.HEAD.POSTAG);
            }

            return new CoNLLSentence(words);
        }
Exemple #2
0
        /// <summary>
        /// Segments <paramref name="sentence"/> into terms using the tags the CRF model assigns
        /// to its atoms, then optionally guesses natures and merges terms via the custom dictionary.
        /// </summary>
        /// <param name="sentence">Characters to segment; an empty array yields an empty list.</param>
        /// <returns>
        /// The segmented terms. Natures are filled in when <c>config.natureTagging</c> is set, and the
        /// list is rebuilt from the combined vertices when <c>config.useCustomDict</c> is set.
        /// </returns>
        public override List <Term> SegSentence(char[] sentence)
        {
            var list = new List <Term>();

            if (sentence.Length == 0)
            {
                return list;
            }

            var convertedChars = CharTable.Convert(sentence);
            var table          = new Table();

            table.v = AtomSeg2Table(convertedChars);
            _crfModel.Tag(table);

            // For each row, the length of column 1 is the atom's width in the sentence and the
            // first character of column 2 is the tag read back after tagging.
            int offset = 0;
            int row    = 0;
            while (row < table.Size)
            {
                if (table.v[row][2][0] == 'B')
                {
                    int begin = offset;

                    // Look for the closing 'E' tag, accumulating the width of every atom before it.
                    while (row < table.Size && table.v[row][2][0] != 'E')
                    {
                        offset += table.v[row][1].Length;
                        row++;
                    }

                    if (row == table.Size)
                    {
                        // Ran off the end without an 'E': offset already covers the last atom.
                        // Stop scanning here; advancing further would index past the last row.
                        list.Add(new Term(new string(sentence, begin, offset - begin), Nature.none));
                        break;
                    }

                    // Stopped on the 'E' atom, whose width has not been counted yet.
                    offset += table.v[row][1].Length;
                    list.Add(new Term(new string(sentence, begin, offset - begin), Nature.none));
                    row++;
                }
                else
                {
                    // In theory only an 'S' tag can reach here, so the atom is a word on its own.
                    int width = table.v[row][1].Length;
                    list.Add(new Term(new string(sentence, offset, width), Nature.none));
                    offset += width;
                    row++;
                }
            }

            if (config.natureTagging)
            {
                var vertices = ToVertexList(list, true);
                Viterbi.Compute(vertices, CoreDictTransfromMatrixDictionary.transformMatrixDictionary);
                for (int i = 0; i < list.Count; i++)
                {
                    var term = list[i];
                    if (term.nature == Nature.none)
                    {
                        // Shifted by one: per the original note, the vertex list has an auxiliary start node prepended.
                        term.nature = vertices[i + 1].GuessNature();
                    }
                }
            }
            if (config.useCustomDict)
            {
                // NOTE(review): open question from the original author - could this overwrite the natures assigned above?
                var vertices = ToVertexList(list, false);
                CombineByCustomDict(vertices);
                list = ToTermList(vertices, config.offset);
            }
            return list;
        }