/**
         * Performs recognition (Japanese person names, judging by the dictionary used below).
         *
         * NOTE(review): the whole scanning loop is commented out, so at present this method only
         * declares locals and obtains a searcher from JapanesePersonDictionary; it never inserts
         * anything into either word graph. The disabled code is kept for reference.
         *
         * @param segResult      coarse segmentation result (not read by the live code)
         * @param wordNetOptimum word graph corresponding to the coarse segmentation result
         * @param wordNetAll     the full word graph; its charArray is handed to the dictionary searcher
         */
        public static void Recognition(LinkedList <Vertex> segResult, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            StringBuilder sbName      = new StringBuilder();
            int           appendTimes = 0;

            char[] charArray = wordNetAll.charArray;
            BaseSearcher <KeyValuePair <String, char> > searcher = JapanesePersonDictionary.getSearcher(charArray);
            KeyValuePair <String, char> entry;
            int activeLine = 1;
            int preOffset  = 0;
            // Disabled port of the original scanning loop (relies on searcher.next(), which is not used here).
            //while ((entry = searcher.next()) != null)
            //{
            //    char label = entry.Value;
            //    String key = entry.Key;
            //    int offset = searcher.getOffset();
            //    if (preOffset != offset)
            //    {
            //        if (appendTimes > 1 && sbName.Length > 2) // a Japanese person name is at least 3 characters long
            //        {
            //            insertName(sbName.ToString(), activeLine, wordNetOptimum, wordNetAll);
            //        }
            //        sbName.Length = 0;
            //        appendTimes = 0;
            //    }
            //    if (appendTimes == 0)
            //    {
            //        if (label == JapanesePersonDictionary.X)
            //        {
            //            sbName.Append(key);
            //            ++appendTimes;
            //            activeLine = offset + 1;
            //        }
            //    }
            //    else
            //    {
            //        if (label == JapanesePersonDictionary.M)
            //        {
            //            sbName.Append(key);
            //            ++appendTimes;
            //        }
            //        else
            //        {
            //            if (appendTimes > 1 && sbName.Length > 2)
            //            {
            //                insertName(sbName.ToString(), activeLine, wordNetOptimum, wordNetAll);
            //            }
            //            sbName.Length = 0;
            //            appendTimes = 0;
            //        }
            //    }
            //    preOffset = offset + key.Length;
            //}
            //if (sbName.Length > 0)
            //{
            //    if (appendTimes > 1)
            //    {
            //        insertName(sbName.ToString(), activeLine, wordNetOptimum, wordNetAll);
            //    }
            //}
        }
Esempio n. 2
0
        /// <summary>
        /// Matches the place-name role sequence against the pattern trie and inserts every accepted hit
        /// into the optimum word net as one merged place vertex.
        /// </summary>
        /// <param name="nss">Role tag chosen for each vertex; their string forms are concatenated into the pattern text.</param>
        /// <param name="vertices">Vertices the role tags were computed for.</param>
        /// <param name="wordnet_op">Word net that receives the merged vertex.</param>
        /// <param name="wordnet_all">Full word net, forwarded to <c>Insert</c>.</param>
        public static void PatternMatch(List <NS> nss, List <Vertex> vertices, WordNet wordnet_op, WordNet wordnet_all)
        {
            var roleBuilder = new StringBuilder(nss.Count);
            foreach (var role in nss)
            {
                roleBuilder.Append(role.ToString());
            }

            var rolePattern = roleBuilder.ToString();
            var words       = vertices.ToArray();

            trie.Match(rolePattern, (begin, end, keyword) =>
            {
                // Glue together the surface text of the vertices covered by the hit [begin, end).
                var nameBuilder = new StringBuilder();
                int cursor      = begin;
                while (cursor < end)
                {
                    nameBuilder.Append(words[cursor].realWord);
                    cursor++;
                }

                var placeName = nameBuilder.ToString();
                if (!IsBadCase(placeName))
                {
                    // The insertion row is the total text length of everything in front of the hit.
                    int row = 0;
                    for (int k = 0; k < begin; k++)
                    {
                        row += words[k].realWord.Length;
                    }

                    wordnet_op.Insert(row, new Vertex(TAG_PLACE, placeName, ATTRIBUTE, CoreDictionary.NS_WORD_ID), wordnet_all);
                }
            });
        }
        /// <summary>
        /// Produces one role/frequency item per vertex for place-name recognition.
        /// </summary>
        /// <param name="vertices">Vertices to tag.</param>
        /// <param name="wordnet_all">Full word net (not read by this method).</param>
        /// <returns>A list with exactly one item per input vertex, in order.</returns>
        private static List <TagFreqItem <NS> > RoleTag(List <Vertex> vertices, WordNet wordnet_all)
        {
            var roles = new List <TagFreqItem <NS> >();

            foreach (var current in vertices)
            {
                var pos = current.GetNature();

                // Vertices already tagged ns with a total frequency of at most 1000 get a fixed role.
                if (Nature.ns == pos && current.attr.totalFreq <= 1000)
                {
                    var fixedRole = current.realWord.Length < 3
                        ? new TagFreqItem <NS>(NS.H, NS.G)      // place name shorter than three characters
                        : new TagFreqItem <NS>(NS.G);
                    roles.Add(fixedRole);
                    continue;
                }

                // Dictionary lookup by the equivalent word; fall back to role Z when there is no entry.
                var looked = PlaceDictionary.dict.Get(current.word);
                if (looked == null)
                {
                    looked = new TagFreqItem <NS>(NS.Z, PlaceDictionary.trans_tr_dict.GetFreq(NS.Z));
                }
                roles.Add(looked);
            }

            return roles;
        }
Esempio n. 4
0
        /// <summary>
        /// Merges the coarse segmentation result using the custom dictionary (longest match), records the
        /// shorter dictionary words found inside each merged entry in the full word net, and returns the
        /// merged result.
        /// </summary>
        /// <param name="vertices">Coarse segmentation result; passed on to the single-argument overload.</param>
        /// <param name="wordNet">Word net that receives the sub-words found inside long entries.</param>
        /// <returns>The merged vertex list produced by the single-argument overload.</returns>
        protected static List <Vertex> CombineByCustomDict(List <Vertex> vertices, WordNet wordNet)
        {
            var list = CombineByCustomDict(vertices);       // merge, yielding the longest matches
            int line = 0;                                   // row of the current entry in the word net

            for (int i = 0; i < list.Count; i++)
            {
                var vertex    = list[i];
                var parentLen = vertex.realWord.Length;
                int currLine  = line;                       // captured copy: `line` keeps advancing below
                if (parentLen >= 3)                         // only entries of three or more characters are re-scanned
                {
                    Action <int, int, WordAttr> action = (begin, end, value) =>
                    {
                        if (end - begin == parentLen)
                        {
                            return;                         // the hit is the entry itself, nothing new to index
                        }
                        // `end` is an exclusive end offset (see the length check above), whereas
                        // String.Substring takes (startIndex, length). Passing `end` directly would yield
                        // the wrong text or throw ArgumentOutOfRangeException when begin > 0.
                        wordNet.Add(currLine + begin, new Vertex(vertex.realWord.Substring(begin, end - begin), value));
                    };
                    CustomDictionary.Parse(vertex.realWord, action);
                }
                line += parentLen;
            }
            return(list);
        }
Esempio n. 5
0
        /// <summary>
        /// Detaches a vertex that has been merged into a larger number/quantifier word: successors that
        /// pointed back at it lose their predecessor, and the vertex is taken out of its own row.
        /// </summary>
        /// <param name="cur">The vertex being removed.</param>
        /// <param name="wordNet">The word net to modify.</param>
        /// <param name="line">Row of the first character of the merged word.</param>
        /// <param name="length">Total length of the merged word so far.</param>
        private static void RemoveFromWordNet(Vertex cur, WordNet wordNet, int line, int length)
        {
            int rowAfterMerged = line + length;             // row immediately following the merged word

            // Clear the predecessor link of every vertex on that row that pointed at the removed vertex.
            foreach (var successor in wordNet.Vertices[rowAfterMerged])
            {
                if (successor.from == cur)
                {
                    successor.from = null;
                }
            }

            // The removed vertex lives on the row where it starts; drop its first occurrence only.
            var ownRow = wordNet.Vertices[rowAfterMerged - cur.realWord.Length];
            for (int pos = 0; pos < ownRow.Count; pos++)
            {
                if (ownRow[pos] == cur)
                {
                    ownRow.RemoveAt(pos);
                    return;
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Matches the organization role sequence against the pattern trie and inserts every accepted hit
        /// into the optimum word net as one merged organization vertex.
        /// </summary>
        /// <param name="nts">Role tag chosen for each vertex; their string forms are concatenated into the pattern text.</param>
        /// <param name="vertices">Vertices the role tags were computed for.</param>
        /// <param name="wordNetOptimum">Word net that receives the merged vertex.</param>
        /// <param name="wordNetAll">Full word net, forwarded to <c>Insert</c>.</param>
        public static void PatternMatch(List <NT> nts, List <Vertex> vertices, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            var tagText = new StringBuilder(nts.Count);
            foreach (var tag in nts)
            {
                tagText.Append(tag.ToString());
            }

            var pattern = tagText.ToString();
            var terms   = vertices.ToArray();

            _trie.Match(pattern, (begin, end, keyword) =>
            {
                // Surface text of the vertices covered by the hit [begin, end).
                var merged = new StringBuilder();
                for (int k = begin; k < end; k++)
                {
                    merged.Append(terms[k].realWord);
                }

                var orgName = merged.ToString();
                if (IsBadCase(orgName))
                {
                    return;                         // known bad cases are not inserted
                }

                // Row of the hit = combined text length of all preceding vertices.
                int row = 0;
                int k2  = 0;
                while (k2 < begin)
                {
                    row += terms[k2].realWord.Length;
                    k2++;
                }

                wordNetOptimum.Insert(row, new Vertex(TAG_GROUP, orgName, ATTRIBUTE, CoreDictionary.NT_WORD_ID), wordNetAll);
            });
        }
        /**
         * Applies rule-based merging/splitting to the coarse segmentation result and then builds the
         * new word net from it.
         *
         * @param linkedArray    coarse segmentation result; handed to fixResultByRule first, then added to the word net
         * @param wordNetOptimum word net that receives all vertices of the (rule-adjusted) result
         */
        protected static void GenerateWord(LinkedList <Vertex> linkedArray, WordNet wordNetOptimum)
        {
            fixResultByRule(linkedArray);

            //--------------------------------------------------------------------
            // Build the new word net
            wordNetOptimum.addAll(linkedArray);
        }
        /// <summary>
        /// Place-name recognition pipeline: role-tags the vertices, decodes the role sequence and hands
        /// it to <c>PlaceDictionary.PatternMatch</c>.
        /// </summary>
        /// <param name="vertices">Vertices of the current segmentation.</param>
        /// <param name="wordnet_op">Word net passed to the pattern matcher for insertion.</param>
        /// <param name="wordnet_all">Full word net.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool Recognition(List <Vertex> vertices, WordNet wordnet_op, WordNet wordnet_all)
        {
            PlaceDictionary.PatternMatch(
                ViterbiExCompute(RoleTag(vertices, wordnet_all)),
                vertices,
                wordnet_op,
                wordnet_all);

            return true;
        }
        /// <summary>
        /// Person-name recognition pipeline: observes roles for the segmented words, decodes the role
        /// sequence and hands it to <c>ChsPersonNameDict.PatternMatch</c>.
        /// </summary>
        /// <param name="wordSegs">Word entries obtained by segmenting the sentence.</param>
        /// <param name="wordNetOptimum">Word net passed to the pattern matcher.</param>
        /// <param name="wordNetAll">Full word net passed to the pattern matcher.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool Recognition(List <Vertex> wordSegs, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            var roleSequence = ViterbiComputeSimply(RoleObserve(wordSegs));

            ChsPersonNameDict.PatternMatch(roleSequence, wordSegs, wordNetOptimum, wordNetAll);

            return true;
        }
 /**
  * Inserts a Japanese person name into the optimum word net, unless it is a known bad case.
  * @param name           the recognized name text
  * @param activeLine     row at which the name vertex is inserted
  * @param wordNetOptimum word net that receives the vertex
  * @param wordNetAll     full word net, forwarded to insert
  */
 private static void insertName(String name, int activeLine, WordNet wordNetOptimum, WordNet wordNetAll)
 {
     if (!isBadCase(name))
     {
         CoreDictionary.Attribute nameAttribute = new CoreDictionary.Attribute(Nature.nrj);
         Vertex nameVertex = new Vertex(Predefine.TAG_PEOPLE, name, nameAttribute, NRConstant.WORD_ID);
         wordNetOptimum.insert(activeLine, nameVertex, wordNetAll);
     }
 }
Esempio n. 11
0
        /// <summary>
        /// Organization-name recognition pipeline: role-tags the vertices, decodes the role sequence and
        /// hands it to <c>OrgDictionary.PatternMatch</c>.
        /// </summary>
        /// <param name="vertices">Vertices of the current segmentation.</param>
        /// <param name="wordNetOptimum">Word net passed to the pattern matcher.</param>
        /// <param name="wordNetAll">Full word net.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool Recognition(List <Vertex> vertices, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            var bestRoles = ViterbiExCompute(RoleTag(vertices, wordNetAll));

            OrgDictionary.PatternMatch(bestRoles, vertices, wordNetOptimum, wordNetAll);

            return true;
        }
Esempio n. 12
0
        /**
         * Merges numerals: a vertex with nature m absorbs the text of the directly following m vertices
         * and of one following quantifier (nature q, qv or qt), and is then re-labelled as nature mq.
         * Does nothing when termList holds fewer than 4 vertices.
         *
         * NOTE(review): both iterator.remove() calls and the initial iterator.next() are commented out,
         * so the absorbed vertices stay in termList and the first vertex is not skipped even though
         * line starts at 1 - confirm whether that is intended.
         *
         * @param termList   segmentation result to scan; matching vertices are modified in place
         * @param wordNetAll full word net; absorbed vertices are passed to removeFromWordNet
         * @param config     only config.indexMode is read: when set, the pure numeral is also added to wordNetAll
         */
        protected void mergeNumberQuantifier(LinkedList <Vertex> termList, WordNet wordNetAll, Config config)
        {
            if (termList.Count < 4)
            {
                return;
            }
            StringBuilder sbQuantifier = new StringBuilder();

            LinkedList <Vertex> .Enumerator iterator = termList.GetEnumerator();
            //iterator.next();
            int line = 1;

            while (iterator.MoveNext())
            {
                Vertex pre = iterator.Current;
                if (pre.hasNature(Nature.m))
                {
                    // Start a merged word with the first numeral.
                    sbQuantifier.Append(pre.realWord);
                    Vertex cur = null;
                    // Absorb every directly following numeral; the loop also stops when the list is exhausted,
                    // in which case cur still holds the last vertex it was assigned (or null if none).
                    while (iterator.MoveNext() && (cur = iterator.Current).hasNature(Nature.m))
                    {
                        sbQuantifier.Append(cur.realWord);
                        //iterator.remove();
                        removeFromWordNet(cur, wordNetAll, line, sbQuantifier.Length);
                    }
                    if (cur != null)
                    {
                        if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt)))
                        {
                            if (config.indexMode)
                            {
                                // In index mode the numeral without its quantifier is recorded as an m vertex too.
                                wordNetAll.add(line, new Vertex(sbQuantifier.ToString(), new CoreDictionary.Attribute(Nature.m)));
                            }
                            sbQuantifier.Append(cur.realWord);
                            //iterator.remove();
                            removeFromWordNet(cur, wordNetAll, line, sbQuantifier.Length);
                        }
                        else
                        {
                            line += cur.realWord.Length;   // the last vertex fetched by the inner loop's MoveNext may not be a quantifier
                        }
                    }
                    // Something was absorbed: rewrite pre as the merged number/quantifier word.
                    if (sbQuantifier.Length != pre.realWord.Length)
                    {
                        pre.realWord        = sbQuantifier.ToString();
                        pre.word            = Predefine.TAG_NUMBER;
                        pre.attribute       = new CoreDictionary.Attribute(Nature.mq);
                        pre.wordID          = CoreDictionary.M_WORD_ID;
                        sbQuantifier.Length = 0;
                    }
                }
                // Reset the buffer and advance the row by the (possibly rewritten) text of pre.
                sbQuantifier.Length = 0;
                line += pre.realWord.Length;
            }
            //        System.out.println(wordNetAll);
        }
        /**
         * Performs recognition of transliterated person names: consecutive matching vertices are glued
         * together and, when at least two were collected, inserted as a single nrf vertex.
         * @param segResult      coarse segmentation result
         * @param wordNetOptimum word graph corresponding to the coarse segmentation result
         * @param wordNetAll     full word graph
         */
        public static void Recognition(LinkedList <Vertex> segResult, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            StringBuilder nameBuffer = new StringBuilder();
            int partCount = 0;          // number of vertices collected into nameBuffer

            LinkedList <Vertex> .Enumerator cursor = segResult.GetEnumerator();
            cursor.MoveNext();          // step onto the first vertex so the loop below starts at the second one
            int currentRow   = 1;
            int nameStartRow = 1;

            while (cursor.MoveNext())
            {
                Vertex current = cursor.Current;

                if (partCount == 0)
                {
                    // A name is triggered by nrf (guessed nature) or nsf (actual nature).
                    if (current.guessNature() == Nature.nrf || current.getNature() == Nature.nsf)
                    {
                        nameBuffer.Append(current.realWord);
                        ++partCount;
                        nameStartRow = currentRow;
                    }
                }
                else if (current.guessNature() == Nature.nrf || TranslatedPersonDictionary.containsKey(current.realWord))
                {
                    // Still inside a name: keep collecting.
                    nameBuffer.Append(current.realWord);
                    ++partCount;
                }
                else
                {
                    // The name ended; only names built from more than one vertex are inserted.
                    if (partCount > 1)
                    {
                        if (HanLP.Config.DEBUG)
                        {
                            // debug output is disabled
                        }
                        CoreDictionary.Attribute nameAttribute = new CoreDictionary.Attribute(Nature.nrf);
                        Vertex nameVertex = new Vertex(Predefine.TAG_PEOPLE, nameBuffer.ToString(), nameAttribute, NRConstant.WORD_ID);
                        wordNetOptimum.insert(nameStartRow, nameVertex, wordNetAll);
                    }
                    nameBuffer.Length = 0;
                    partCount         = 0;
                }

                currentRow += current.realWord.Length;
            }
        }
Esempio n. 14
0
 /// <summary>
 /// Initializes SUMO once; subsequent calls after a successful initialization return immediately.
 /// </summary>
 /// <param name="baseDirectory">The directory path which contains the 'KBs' directory</param>
 /// <exception cref="DirectoryNotFoundException">
 /// <paramref name="baseDirectory"/> does not exist (only checked while not yet initialized).
 /// </exception>
 public static void Init(string baseDirectory)
 {
     if (_initialized)
     {
         return;
     }

     if (!Directory.Exists(baseDirectory))
     {
         throw new DirectoryNotFoundException("Cannot find " + baseDirectory);
     }

     // Publish the base directory through SIGMA_HOME before the manager is initialized.
     Environment.SetEnvironmentVariable("SIGMA_HOME", baseDirectory);
     _mgr.initializeOnce();
     WordNet.Init();             // the WordNet mappings are initialized as part of the same step
     _initialized = true;
 }
Esempio n. 15
0
 /**
  * Completely erases a word from the word net: successors on the row right after the merged word
  * that point back at it lose their predecessor, and the vertex itself is removed from the row
  * where it starts.
  * @param cur        the word (vertex) to erase
  * @param wordNetAll the word net
  * @param line       row currently being scanned
  * @param length     current length of the merge buffer
  */
 private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length)
 {
     LinkedList <Vertex>[] vertexes = wordNetAll.getVertexes();

     // Unlink successors that were reached through the erased vertex.
     foreach (Vertex vertex in vertexes[line + length])
     {
         if (vertex.from == cur)
         {
             vertex.from = null;
         }
     }

     // The original port only enumerated this row because .NET enumerators have no remove(),
     // so the vertex was never actually deleted. Walk the nodes instead; the successor is
     // fetched before removal because Remove detaches the node from the list.
     LinkedList <Vertex> ownRow = vertexes[line + length - cur.realWord.Length];
     LinkedListNode <Vertex> node = ownRow.First;
     while (node != null)
     {
         LinkedListNode <Vertex> next = node.Next;
         if (node.Value == cur)
         {
             ownRow.Remove(node);
         }
         node = next;
     }
 }
Esempio n. 16
0
        /// <summary>
        /// Produces one role/frequency item per vertex for organization-name recognition, using fixed
        /// roles for certain natures and the organization dictionary otherwise.
        /// </summary>
        /// <param name="vertices">Vertices to tag.</param>
        /// <param name="wordNetAll">Full word net (not read by this method).</param>
        /// <returns>A list with exactly one item per input vertex, in order.</returns>
        public static List <TagFreqItem <NT> > RoleTag(List <Vertex> vertices, WordNet wordNetAll)
        {
            var roles = new List <TagFreqItem <NT> >();

            foreach (var vertex in vertices)
            {
                var nature = vertex.GetNature();

                if (nature == Nature.nrf)
                {
                    // Transliterated person name: fixed role only while the total frequency is at most 1000,
                    // otherwise fall through to the dictionary lookup.
                    if (vertex.attr.totalFreq <= 1000)
                    {
                        roles.Add(new TagFreqItem <NT>(NT.F, 1000));
                        continue;
                    }
                }
                else if (nature == Nature.ni || nature == Nature.nic || nature == Nature.nis || nature == Nature.nit)
                {
                    // Organization-related natures carry two candidate roles.
                    var orgRoles = new TagFreqItem <NT>(NT.K, 1000);
                    orgRoles.AddLabel(NT.D, 1000);
                    roles.Add(orgRoles);
                    continue;
                }
                else if (nature == Nature.m)
                {
                    roles.Add(new TagFreqItem <NT>(NT.M, 1000));
                    continue;
                }

                // Dictionary lookup by the equivalent word; fall back to role Z when there is no entry.
                var looked = OrgDictionary.dictionary.Get(vertex.word);
                if (looked == null)
                {
                    looked = new TagFreqItem <NT>(NT.Z, OrgDictionary.transformMatrixDictionary.GetFreq(NT.Z));
                }

                roles.Add(looked);
            }

            return roles;
        }
Esempio n. 17
0
        /// <summary>
        /// Organization-name recognition pipeline: role-tags the segmentation result, decodes the role
        /// sequence and hands it to <c>OrganizationDictionary.parsePattern</c>. When
        /// <c>HanLP.Config.DEBUG</c> is set, both intermediate results are written to the console.
        /// </summary>
        /// <param name="pWordSegResult">Coarse segmentation result.</param>
        /// <param name="wordNetOptimum">Word net passed to the pattern parser.</param>
        /// <param name="wordNetAll">Full word net.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool Recognition(LinkedList <Vertex> pWordSegResult, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            LinkedList <EnumItem <Corpus.Tag.NT> > roleTagList = roleTag(pWordSegResult, wordNetAll);

            if (HanLP.Config.DEBUG)
            {
                StringBuilder sbLog = new StringBuilder();
                foreach (EnumItem <Corpus.Tag.NT> roleItem in roleTagList)
                {
                    sbLog.Append('[');
                    sbLog.Append(' ');
                    sbLog.Append(roleItem);
                    sbLog.Append(']');
                }
                // .NET composite formatting uses {0}; the Java-style "%s\n" printed a literal "%s"
                // and silently dropped the argument. WriteLine already terminates the line.
                Console.WriteLine("机构名角色观察:{0}", sbLog.ToString());
            }
            LinkedList <Corpus.Tag.NT> NTList = viterbiExCompute(roleTagList);

            if (HanLP.Config.DEBUG)
            {
                StringBuilder sbLog = new StringBuilder();
                sbLog.Append('[');
                foreach (Corpus.Tag.NT tag in NTList)
                {
                    sbLog.Append('/');
                    sbLog.Append(tag);
                    sbLog.Append(" ,");
                }
                if (sbLog.Length > 1)
                {
                    // Strip the trailing " ,". StringBuilder.Remove takes (startIndex, length); passing the
                    // end index as length threw ArgumentOutOfRangeException.
                    sbLog.Remove(sbLog.Length - 2, 2);
                }
                sbLog.Append(']');
                Console.WriteLine("机构名角色标注:{0}", sbLog.ToString());
            }

            OrganizationDictionary.parsePattern(NTList, pWordSegResult, wordNetOptimum, wordNetAll);
            return(true);
        }
Esempio n. 18
0
        /// <summary>
        /// Recognizes transliterated person names: runs of consecutive vertices that are guessed as nrf
        /// or listed in <c>TranslatedPersonDictionary</c> are glued together and, when a run has at least
        /// two vertices, inserted as one nrf vertex once the run is ended by a non-matching vertex.
        /// </summary>
        /// <param name="vertices">Segmentation result; index 0 (the start helper node) is skipped.</param>
        /// <param name="wordNetOptimum">Word net that receives the merged vertex.</param>
        /// <param name="wordNetAll">Full word net, forwarded to <c>Insert</c>.</param>
        public static void Recognition(List <Vertex> vertices, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            var nameBuffer = new StringBuilder();
            int pieces     = 0;                     // vertices collected into the current run

            int row      = 1;                       // row of the current vertex; row 0 is the start helper node
            int startRow = 1;                       // row at which the current run began

            for (int idx = 1; idx < vertices.Count; idx++)
            {
                var current = vertices[idx];

                // The same membership test decides both "start a run" and "continue a run".
                bool partOfName = current.GuessNature() == Nature.nrf
                                  || TranslatedPersonDictionary.ContainsKey(current.realWord);

                if (partOfName)
                {
                    if (pieces == 0)
                    {
                        startRow = row;             // first piece: remember where the name starts
                    }
                    nameBuffer.Append(current.realWord);
                    ++pieces;
                }
                else if (pieces > 0)
                {
                    // The run ended; a single piece on its own is not treated as a name.
                    if (pieces > 1)
                    {
                        var nameVertex = new Vertex(TAG_PEOPLE, nameBuffer.ToString(), new WordAttr(Nature.nrf), CoreDictionary.NR_WORD_ID);
                        wordNetOptimum.Insert(startRow, nameVertex, wordNetAll);
                    }
                    nameBuffer.Clear();
                    pieces = 0;
                }

                row += current.realWord.Length;     // row of the next vertex
            }
        }
        /**
         * Builds the unigram word net: adds every core-dictionary hit, then fills each row that is
         * still empty with atom segments so that the graph stays connected.
         * (The custom-dictionary lookup of the original implementation is not performed here.)
         *
         * @param wordNetStorage word net to fill; its charArray is the text being segmented
         */
        protected void GenerateWordNet(WordNet wordNetStorage)
        {
            char[] text = wordNetStorage.charArray;

            // Core dictionary lookup; hits are stored one row further down than their text offset.
            DoubleArrayTrie <CoreDictionary.Attribute> .Searcher hits = CoreDictionary.trie.getSearcher(text, 0);
            while (hits.next())
            {
                String word = new String(text, hits.begin, hits.length);
                wordNetStorage.add(hits.begin + 1, new Vertex(word, hits.value, hits.index));
            }

            // Atom segmentation for the gaps.
            List <Vertex>[] rows = wordNetStorage.getVertexes();
            int row = 1;
            while (row < rows.Length)
            {
                if (rows[row].Count != 0)
                {
                    // Row already populated: jump ahead by the length of its last vertex.
                    List <Vertex> populated = rows[row];
                    row += populated[populated.Count - 1].realWord.Length;
                    continue;
                }

                // Find the next populated row (the final row is never inspected).
                int next = row + 1;
                while (next < rows.Length - 1 && rows[next].Count == 0)
                {
                    ++next;
                }

                wordNetStorage.add(row, quickAtomSegment(text, row - 1, next - 1));
                row = next;
            }
        }
Esempio n. 20
0
        /**
         * <summary>Collects, from another WordNet, the SynSets referenced by this object's interlingual
         * relations of type SYNONYM.</summary>
         *
         * <param name="secondLanguage">WordNet in the other language in which the referenced SynSets are looked up</param>
         * <returns>the SynSets found in <paramref name="secondLanguage"/>; relations whose target is missing are skipped</returns>
         */
        public List <SynSet> GetInterlingual(WordNet secondLanguage)
        {
            var matches = new List <SynSet>();

            foreach (var candidate in _relations)
            {
                // Only interlingual relations are of interest.
                if (!(candidate is InterlingualRelation interlingual))
                {
                    continue;
                }

                // ...and among those, only synonym links.
                if (interlingual.GetType() != InterlingualDependencyType.SYNONYM)
                {
                    continue;
                }

                var counterpart = secondLanguage.GetSynSetWithId(interlingual.GetName());
                if (counterpart != null)
                {
                    matches.Add(counterpart);
                }
            }

            return matches;
        }
Esempio n. 21
0
        /// <summary>
        /// Viterbi pass over the word net. This does not segment anything itself; it selects one path
        /// through the net by letting every vertex pick its predecessor via <c>UpdateMyFrom</c> and then
        /// following the <c>from</c> links back from the end helper node.
        /// </summary>
        /// <param name="wordNet">Word net whose rows are traversed.</param>
        /// <returns>The vertices on the selected path in sentence order, including the start and end helper nodes.</returns>
        private static List <Vertex> Viterbi(WordNet wordNet)
        {
            var rows    = wordNet.Vertices;
            int lastRow = rows.Length - 1;

            // Row 1 can only be reached from the start helper node on row 0.
            foreach (var first in rows[1])
            {
                first.UpdateMyFrom(rows[0].First());
            }

            // Forward pass: every reachable vertex offers itself as predecessor to the vertices on the
            // row where it ends (which differs per vertex, depending on its length).
            for (int row = 1; row < lastRow; row++)
            {
                var current = rows[row];
                if (current != null && current.Count != 0)
                {
                    foreach (var node in current)
                    {
                        if (node.from != null)
                        {
                            foreach (var successor in rows[row + node.realWord.Length])
                            {
                                successor.UpdateMyFrom(node);
                            }
                        }
                    }
                }
            }

            // Backward pass: start at the end helper node on the last row and follow the predecessors.
            var path = new List <Vertex>();
            for (var step = rows[lastRow].First(); step != null; step = step.from)
            {
                path.Add(step);
            }

            path.Reverse();
            return path;
        }
Esempio n. 22
0
        /// <summary>
        /// Viterbi pass over the word net: lets every vertex pick its predecessor via <c>updateFrom</c>
        /// and then follows the <c>from</c> links back from the first vertex of the last row.
        /// </summary>
        /// <param name="wordNet">Word net whose rows are traversed.</param>
        /// <returns>The vertices on the selected path, in sentence order.</returns>
        private static LinkedList <Vertex> viterbi(WordNet wordNet)
        {
            LinkedList <Vertex>[] rows = wordNet.getVertexes();
            int lastRow = rows.Length - 1;

            // Row 1 can only be reached from the first vertex of row 0.
            foreach (Vertex first in rows[1])
            {
                first.updateFrom(rows[0].First());
            }

            // Forward pass over all rows except the last one.
            int row = 1;
            while (row < lastRow)
            {
                LinkedList <Vertex> current = rows[row];
                if (current != null)
                {
                    foreach (Vertex node in current)
                    {
                        if (node.from != null)
                        {
                            // Successors live on the row where this vertex ends.
                            foreach (Vertex successor in rows[row + node.realWord.Length])
                            {
                                successor.updateFrom(node);
                            }
                        }
                    }
                }
                ++row;
            }

            // Backward pass; prepending yields sentence order without a final reverse.
            LinkedList <Vertex> path = new LinkedList <Vertex>();
            for (Vertex step = rows[lastRow].First(); step != null; step = step.from)
            {
                path.AddFirst(step);
            }

            return path;
        }
        /// <summary>
        /// Builds the unigram word net: adds every core-dictionary hit, then covers the parts of the
        /// sentence without any hit by quick atom segmentation.
        /// </summary>
        /// <param name="wordNet">Freshly created word net to fill; <c>charArr</c> holds the characters of the sentence.</param>
        protected void GenerateWordNet(WordNet wordNet)
        {
            var text = wordNet.charArr;

            // Core dictionary lookup. Rows are shifted by one because of the start helper node.
            var hits = CoreDictionary._trie.GetSearcher(text, 0);
            while (hits.Next())
            {
                var word = new string(text, hits.begin, hits.length);
                wordNet.Add(hits.begin + 1, new Vertex(word, hits.value, hits.index));
            }

            // The dictionary only covers known words; the stretches in between are filled with atom
            // segments here and may be refined by dedicated recognizers later.
            var rows = wordNet.Vertices;
            int row  = 1;
            while (row < rows.Length)
            {
                if (rows[row].Count != 0)
                {
                    // Populated row: continue after its last vertex.
                    row += rows[row].Last().realWord.Length;
                    continue;
                }

                // Empty row: look for the next populated one (the final row is never inspected).
                int next = row + 1;
                while (next < rows.Length - 1 && rows[next].Count == 0)
                {
                    next++;
                }

                // Both bounds are converted back to text offsets by subtracting the helper-node shift.
                wordNet.Add(row, QuickAtomSegment(text, row - 1, next - 1));
                row = next;
            }
        }
 /// <summary>
 /// Handler with the <c>DoWork</c> event signature that constructs the <c>WordNet</c> from
 /// <c>HypernymFile</c> and <c>SynsetFile</c> and times the construction with <c>_workWatch</c>.
 /// </summary>
 /// <param name="sender">Event source (not used).</param>
 /// <param name="e">Event data (not used).</param>
 private void ProcessGraphInputFilesBgWorkerWork(object sender, DoWorkEventArgs e)
 {
     _workWatch = Stopwatch.StartNew();
     // The third constructor argument is a callback member of this class - presumably for progress reporting; verify against WordNet.
     _wordNet   = new WordNet(HypernymFile, SynsetFile, ProcessGraphInputFilesWordNetOnProgressGraphReport);
     // NOTE(review): the stopwatch is not stopped if the constructor throws.
     _workWatch.Stop();
 }
Esempio n. 25
0
        /**
         * Pattern matching.
         *
         * NOTE(review): the trie-based matching is commented out. The live code only concatenates the
         * role tags into a pattern string and then discards it, so nothing is inserted into either
         * word net.
         *
         * @param nsList         the decided role tag sequence
         * @param vertexList     the original sequence without role tags (not read by the live code)
         * @param wordNetOptimum the graph to be optimized (not touched by the live code)
         * @param wordNetAll     the full word net (not touched by the live code)
         */
        public static void parsePattern(List <Corpus.Tag.NS> nsList, LinkedList <Vertex> vertexList, WordNet wordNetOptimum, WordNet wordNetAll)
        {
            //        ListIterator<Vertex> listIterator = vertexList.listIterator();
            StringBuilder sbPattern = new StringBuilder(nsList.Count);

            // Concatenate the string form of every role tag into the pattern text.
            foreach (Corpus.Tag.NS ns in nsList)
            {
                sbPattern.Append(ns.ToString());
            }
            String pattern = sbPattern.ToString();
            // Disabled Java-style matching code, kept for reference:
            //Vertex[] wordArray = vertexList.toArray(new Vertex[0]);
            //            Vertex[] wordArray = vertexList.ToArray();
            //            trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<String>()
            //        {

            //            public void hit(int begin, int end, String value)
            //    {
            //        StringBuilder sbName = new StringBuilder();
            //        for (int i = begin; i < end; ++i)
            //        {
            //            sbName.Append(wordArray[i].realWord);
            //        }
            //        String name = sbName.ToString();
            //        // adjust for some known bad cases
            //        if (isBadCase(name)) return;

            //        // from here on it officially counts as a name
            //        if (HanLP.Config.DEBUG)
            //        {
            //            Console.WriteLine("识别出地名:%s %s\n", name, value);
            //        }
            //        int offset = 0;
            //        for (int i = 0; i < begin; ++i)
            //        {
            //            offset += wordArray[i].realWord.length();
            //        }
            //        wordNetOptimum.insert(offset, new Vertex(Predefine.TAG_PLACE, name, ATTRIBUTE, WORD_ID), wordNetAll);
            //    }
            //});
        }
Esempio n. 26
0
 /// <summary>
 /// Initializes the WordNet provider by storing the supplied instance in the <c>wordNet</c> member.
 /// </summary>
 /// <param name="wordnet">The WordNet instance to store; it is assigned as-is, without any validation.</param>
 public void Initialize(WordNet wordnet)
 {
     wordNet = wordnet;
 }
Esempio n. 27
0
        /// <summary>
        /// Builds synset shells from a word index line. Each shell is a <c>SynSet</c> constructed from
        /// just the given POS, one offset parsed from the line, and the supplied WordNet instance.
        /// The method is static and receives the WordNet instance explicitly through
        /// <paramref name="wordNet"/>.
        /// </summary>
        /// <param name="wordIndexLine">Space-separated word index line; its third field is the number of synsets and its trailing fields are the synset offsets.</param>
        /// <param name="pos">POS passed to every created shell.</param>
        /// <param name="mostCommonSynSet">Receives the shell built from the first offset on the line.</param>
        /// <param name="wordNet">The WordNet instance passed to every created shell.</param>
        /// <returns>The shells, ordered from the last offset on the line to the first.</returns>
        /// <exception cref="System.Exception">No offset was processed, so no most common synset could be determined.</exception>
        internal static List <SynSet> GetSynSetShells(string wordIndexLine, WordNetPos pos, out SynSet mostCommonSynSet, WordNet wordNet)
        {
            // Assign the out parameter up front so it stays null if parsing throws part-way through.
            mostCommonSynSet = null;

            string[] fields = wordIndexLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // The declared synset count tells how many trailing fields are offsets.
            int shellCount       = int.Parse(fields[2]);
            int firstOffsetIndex = fields.Length - shellCount;

            var shells = new List <SynSet>();

            // Walk the offsets backwards, from the last field down to the first offset.
            int cursor = fields.Length;
            while (--cursor >= firstOffsetIndex)
            {
                shells.Add(new SynSet(pos, int.Parse(fields[cursor]), wordNet));
            }

            // The first offset on the line is visited last, so the final shell added is the most common one.
            if (shells.Count > 0)
            {
                mostCommonSynSet = shells[shells.Count - 1];
            }

            if (mostCommonSynSet == null)
            {
                throw new Exception("Failed to get most common synset");
            }

            return shells;
        }
Esempio n. 28
0
        /// <summary>
        /// Produces one <c>NT</c> role-tag item per vertex of <paramref name="vertexList"/>, in order.
        /// Vertices whose guessed nature is <c>nz</c> (with total frequency at most 1000), one of
        /// <c>ni</c>/<c>nic</c>/<c>nis</c>/<c>nit</c>, or <c>m</c> get a fixed item; every other vertex
        /// is looked up in the organization dictionary and falls back to a <c>Z</c> item.
        /// </summary>
        /// <param name="vertexList">The vertices to tag.</param>
        /// <param name="wordNetAll">Not read by this method.</param>
        /// <returns>The tag items, one per vertex.</returns>
        public static LinkedList <EnumItem <Corpus.Tag.NT> > roleTag(LinkedList <Vertex> vertexList, WordNet wordNetAll)
        {
            var roles = new LinkedList <EnumItem <Corpus.Tag.NT> >();

            foreach (Vertex current in vertexList)
            {
                Nature guessed = current.guessNature();

                if (guessed == Nature.nz)
                {
                    if (current.getAttribute().totalFrequency <= 1000)
                    {
                        roles.AddLast(new EnumItem <Corpus.Tag.NT>(Corpus.Tag.NT.F, 1000));
                        continue;
                    }
                    // An nz word above the frequency threshold is handled by the dictionary lookup below.
                }
                else if (guessed == Nature.ni || guessed == Nature.nic || guessed == Nature.nis || guessed == Nature.nit)
                {
                    var organizationItem = new EnumItem <Corpus.Tag.NT>(Corpus.Tag.NT.K, 1000);
                    organizationItem.addLabel(Corpus.Tag.NT.D, 1000);
                    roles.AddLast(organizationItem);
                    continue;
                }
                else if (guessed == Nature.m)
                {
                    roles.AddLast(new EnumItem <Corpus.Tag.NT>(Corpus.Tag.NT.M, 1000));
                    continue;
                }

                // Look up by the equivalent word (vertex.word) rather than the surface form, for better precision.
                EnumItem <Corpus.Tag.NT> dictionaryItem = OrganizationDictionary.dictionary.get(current.word);
                if (dictionaryItem == null)
                {
                    int fallbackFrequency = OrganizationDictionary.transformMatrixDictionary.getTotalFrequency(Corpus.Tag.NT.Z);
                    dictionaryItem = new EnumItem <Corpus.Tag.NT>(Corpus.Tag.NT.Z, fallbackFrequency);
                }
                roles.AddLast(dictionaryItem);
            }

            return roles;
        }
Esempio n. 29
0
        /// <summary>
        /// Appends the first vertex of row <paramref name="line"/> of <paramref name="wordNetAll"/> to
        /// <paramref name="listIterator"/> and a matching tag item to <paramref name="tagList"/>.
        /// </summary>
        /// <param name="listIterator">List that receives the vertex.</param>
        /// <param name="tagList">List that receives the tag item.</param>
        /// <param name="wordNetAll">Word net queried through <c>getFirst</c>.</param>
        /// <param name="line">Row of the word net to read.</param>
        /// <param name="ns">Tag wrapped in a new item with the value 1000.</param>
        private static void insert(List <Vertex> listIterator, List <EnumItem <Corpus.Tag.NS> > tagList, WordNet wordNetAll, int line, Corpus.Tag.NS ns)
        {
            // The upstream code asserted here that the row is not empty (vertex != null);
            // this port performs no such check and appends whatever getFirst returns.
            listIterator.Add(wordNetAll.getFirst(line));

            EnumItem <Corpus.Tag.NS> tagItem = new EnumItem <Corpus.Tag.NS>(ns, 1000);
            tagList.Add(tagItem);
        }
Esempio n. 30
0
        /// <summary>
        /// Creates synset shells for a word index line. A shell is a <c>SynSet</c> built from only the
        /// given POS, an offset parsed from the line, and the WordNet instance supplied through
        /// <paramref name="wordNet"/>; the method is static and takes that instance explicitly.
        /// </summary>
        /// <param name="wordIndexLine">Space-separated word index line; field three is the synset count and the trailing fields are the offsets.</param>
        /// <param name="pos">POS given to each shell.</param>
        /// <param name="mostCommonSynSet">Receives the shell created from the first offset on the line.</param>
        /// <param name="wordNet">The WordNet instance given to each shell.</param>
        /// <returns>The shells in reverse line order (last offset first).</returns>
        /// <exception cref="System.Exception">No offset was processed, so there is no most common synset.</exception>
        internal static List<SynSet> GetSynSetShells(string wordIndexLine, WordNetPos pos, out SynSet mostCommonSynSet, WordNet wordNet) {
            // Set the out parameter first so it is null whenever parsing fails below.
            mostCommonSynSet = null;
            var shells = new List<SynSet>();

            var tokens = wordIndexLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // The declared count locates the first offset among the trailing tokens.
            int declaredCount = int.Parse(tokens[2]);
            int firstOffsetIndex = tokens.Length - declaredCount;

            // Consume offsets from the end of the line towards the first offset, remembering the latest shell.
            SynSet latest = null;
            int position = tokens.Length - 1;
            while (position >= firstOffsetIndex) {
                latest = new SynSet(pos, int.Parse(tokens[position]), wordNet);
                shells.Add(latest);
                position--;
            }

            // The shell created last comes from the first offset, i.e. the most common synset.
            mostCommonSynSet = latest;
            if (mostCommonSynSet == null) {
                throw new Exception("Failed to get most common synset");
            }

            return shells;
        }
Esempio n. 31
0
 /// <summary>
 /// Initializes the WordNet provider by storing the supplied instance in the <c>wordNet</c> member.
 /// </summary>
 /// <param name="wordnet">The WordNet instance to store; it is assigned as-is, without any validation.</param>
 public void Initialize(WordNet wordnet) {
     wordNet = wordnet;
 }
Esempio n. 32
0
        /// <summary>
        /// Produces one <c>NS</c> role-tag item per vertex of <paramref name="vertexList"/>, in order.
        /// A vertex whose nature is <c>ns</c> and whose total frequency is at most 1000 gets a fixed
        /// item (labels <c>H</c> and <c>G</c> when its real word is shorter than three characters,
        /// otherwise <c>G</c> only); every other vertex is looked up in the place dictionary and
        /// falls back to a <c>Z</c> item.
        /// </summary>
        /// <param name="vertexList">The vertices to tag.</param>
        /// <param name="wordNetAll">Not read by this method.</param>
        /// <returns>The tag items, one per vertex.</returns>
        public static LinkedList <EnumItem <Corpus.Tag.NS> > roleTag(LinkedList <Vertex> vertexList, WordNet wordNetAll)
        {
            LinkedList <EnumItem <Corpus.Tag.NS> > tagList = new LinkedList <EnumItem <Corpus.Tag.NS> >();

            foreach (Vertex vertex in vertexList)
            {
                if (Nature.ns == vertex.getNature() && vertex.getAttribute().totalFrequency <= 1000)
                {
                    if (vertex.realWord.Length < 3)
                    {
                        // Short (two-character) place name: it may still take a suffix or a prefix,
                        // so it carries BOTH the H and the G label. The previous code passed
                        // (int)NS.G as the frequency of H, which dropped the G label entirely and
                        // used G's ordinal value as a weight.
                        // NOTE(review): weight 1 per label mirrors the upstream multi-label
                        // constructor; confirm it matches this port's single-label default.
                        EnumItem <Corpus.Tag.NS> shortPlaceItem = new EnumItem <Corpus.Tag.NS>(Corpus.Tag.NS.H, 1);
                        shortPlaceItem.addLabel(Corpus.Tag.NS.G, 1);
                        tagList.AddLast(shortPlaceItem);
                    }
                    else
                    {
                        // Longer place names may only take a suffix.
                        tagList.AddLast(new EnumItem <Corpus.Tag.NS>(Corpus.Tag.NS.G));
                    }
                    continue;
                }

                // Look up by the equivalent word (vertex.word) rather than the surface form, for better precision.
                EnumItem <Corpus.Tag.NS> NSEnumItem = PlaceDictionary.dictionary.get(vertex.word);
                if (NSEnumItem == null)
                {
                    NSEnumItem = new EnumItem <Corpus.Tag.NS>(Corpus.Tag.NS.Z, PlaceDictionary.transformMatrixDictionary.getTotalFrequency(Corpus.Tag.NS.Z));
                }
                tagList.AddLast(NSEnumItem);
            }
            return(tagList);
        }