Exemplo n.º 1
0
        /// <summary>
        /// Backward (right-to-left) longest-match segmentation over a word graph.
        /// </summary>
        /// <remarks>
        /// Starting at the last column, the lowest-numbered row that holds an element in the
        /// current column is selected, its word is prepended to the result, and the search
        /// continues in the column whose index equals that row. The walk stops once the
        /// current column is 1 or less.
        /// </remarks>
        /// <param name="m_segGraph">Word graph to walk.</param>
        /// <returns>
        /// The selected words in sentence order, separated by "/"; an empty string when
        /// nothing was selected.
        /// </returns>
        public string BackSplitting(RowFirstDynamicArray <ChainContent> m_segGraph)
        {
            string abc  = "";
            int    nCol = m_segGraph.ColumnCount - 1;
            int    nRow = m_segGraph.RowCount - 1;

            while (nCol > 1)
            {
                // Look up each cell once and remember the first hit.
                ChainItem <ChainContent> found = null;
                int foundRow = -1;

                for (int i = 0; i <= nRow; i++)
                {
                    found = m_segGraph.GetElement(i, nCol);
                    if (null != found)
                    {
                        foundRow = i;
                        break;
                    }
                }

                // No element in this column: the original loop would spin forever here
                // because nCol was never updated. Stop with what has been collected.
                if (null == found)
                {
                    break;
                }

                if (abc == "")
                {
                    abc = found.Content.sWord;
                }
                else
                {
                    abc = found.Content.sWord + "/" + abc;
                }

                // The walk must move strictly to the left, otherwise it cannot terminate.
                if (foundRow >= nCol)
                {
                    break;
                }
                nCol = foundRow;
            }
            return(abc);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Manual smoke test: loads the core and bigram dictionaries, builds the word net of a
        /// fixed sample sentence, generates its bigram graph and prints that graph to the console.
        /// Prints an error message and returns early when either dictionary fails to load.
        /// </summary>
        public static void TestBiGraphGenerate()
        {
            WordDictionary core = new WordDictionary();
            bool coreLoaded = core.Load(coreDictFile);
            if (!coreLoaded)
            {
                Console.WriteLine("coreDict 字典装入错误!");
                return;
            }

            WordDictionary bigram = new WordDictionary();
            bool bigramLoaded = bigram.Load(biDictFile);
            if (!bigramLoaded)
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            // Sample sentence wrapped in the sentence begin/end markers.
            string wrapped = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

            // Step 1: atom segmentation.
            List <AtomNode> atoms = Segment.AtomSegment(wrapped);

            // Step 2: dictionary lookup; every candidate word is stored in the word net.
            RowFirstDynamicArray <ChainContent> wordNet = Segment.GenerateWordNet(atoms, core);

            // Step 3: every possible pair of adjacent words.
            ColumnFirstDynamicArray <ChainContent> pairs = Segment.BiGraphGenerate(wordNet, 0.1, bigram, core);

            string dump = pairs.ToString();
            Console.WriteLine(dump);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds the word graph that holds every candidate segmentation of a sentence.
        /// </summary>
        /// <param name="sSentence">Raw sentence; the sentence begin/end markers are added here.</param>
        /// <returns>
        /// The generated word graph, or null (after printing an error message) when the core
        /// dictionary cannot be loaded.
        /// </returns>
        public RowFirstDynamicArray <ChainContent> GetSegGraph(string sSentence)
        {
            WordDictionary dictionary = new WordDictionary();

            if (dictionary.Load(coreDictFile))
            {
                string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
                List <AtomNode> atoms = Segment.AtomSegment(wrapped);

                return Segment.GenerateWordNet(atoms, dictionary);
            }

            Console.WriteLine("字典装入错误!");
            return null;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Manual smoke test: loads the core dictionary, builds the word net of a fixed sample
        /// sentence and prints it to the console. Prints an error message and returns early
        /// when the dictionary fails to load.
        /// </summary>
        public static void TestGenerateWordNet()
        {
            WordDictionary dictionary = new WordDictionary();
            bool loaded = dictionary.Load(coreDictFile);

            if (loaded)
            {
                // Sample sentence wrapped in the sentence begin/end markers.
                string wrapped = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;

                List <AtomNode> atoms = Segment.AtomSegment(wrapped);
                RowFirstDynamicArray <ChainContent> wordNet = Segment.GenerateWordNet(atoms, dictionary);

                Console.WriteLine(wordNet.ToString());
            }
            else
            {
                Console.WriteLine("字典装入错误!");
            }
        }
Exemplo n.º 5
0
      /// <summary>
      /// Runs the bigram segmentation pipeline on a sentence: atom segmentation, word net
      /// generation, bigram graph generation and N-shortest-path search. Each path is then
      /// converted into a word array, and the stage events are raised along the way.
      /// </summary>
      /// <param name="sSentence">Sentence to segment.</param>
      /// <param name="smoothPara">Smoothing parameter forwarded to BiGraphGenerate.</param>
      /// <param name="nKind">Value forwarded to NShortPath.Calculate.</param>
      /// <returns>The number of segmentation results stored in m_pWordSeg.</returns>
      /// <exception cref="Exception">biDict or coreDict has not been initialized.</exception>
      public int BiSegment(string sSentence, double smoothPara, int nKind)
      {
         if (biDict == null || coreDict == null)
         {
            throw new Exception("biDict 或 coreDict 尚未初始化!");
         }

         // Step 1: atom segmentation.
         atomSegment = AtomSegment(sSentence);
         OnAtomSegment(atomSegment);

         // Step 2: dictionary lookup; every candidate word goes into the word net.
         segGraph = GenerateWordNet(atomSegment, coreDict);
         OnGenSegGraph(segGraph);

         // Step 3: every possible pair of adjacent words.
         biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
         OnGenBiSegGraph(biGraphResult);

         // Step 4: N-shortest-path search yields several candidate segmentations.
         NShortPath.Calculate(biGraphResult, nKind);
         List<int[]> candidatePaths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
         OnNShortPath(candidatePaths, segGraph);

         m_pWordSeg = new List<WordResult[]>();
         m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

         // Step 5: turn each path into words; paths that produce no words are dropped.
         foreach (int[] path in candidatePaths)
         {
            WordLinkedArray chain = BiPath2LinkedArray(path, segGraph, atomSegment);
            WordResult[] words = GenerateWord(path, chain, m_graphOptimum);

            if (words == null)
            {
               continue;
            }
            m_pWordSeg.Add(words);
         }

         OnBeforeOptimize(m_pWordSeg);

         return m_pWordSeg.Count;
      }
Exemplo n.º 6
0
        /// <summary>
        /// Forward (left-to-right) longest-match segmentation over a word graph.
        /// </summary>
        /// <remarks>
        /// The walk starts at the item that follows element (0, 1). Within a row it advances to
        /// the last item of that row, appends that item's word to the result, and then jumps
        /// to the first item of the row whose index equals the item's column. It ends when the
        /// current item has no successor; that final item is not appended.
        /// </remarks>
        /// <param name="m_segGraph">Word graph to walk.</param>
        /// <returns>
        /// The selected words separated by "/"; an empty string when the graph has no
        /// element (0, 1) or nothing was selected.
        /// </returns>
        public string ForwardSplitting(RowFirstDynamicArray <ChainContent> m_segGraph)
        {
            string abc = "";

            ChainItem <ChainContent> dfg = m_segGraph.GetElement(0, 1);
            if (null == dfg)
            {
                // Nothing to walk; the original dereferenced null here.
                return(abc);
            }

            ChainItem <ChainContent> aa = dfg.next;
            bool wroteWord = false;

            // aa can become null when a row lookup finds nothing, so it is checked on every pass.
            while (null != aa && null != aa.next)
            {
                if (aa.next.row == aa.row)
                {
                    // A later item exists in the same row: keep advancing.
                    aa = aa.next;
                }
                else
                {
                    // aa is the last item of its row: emit it. Only words after the first
                    // one are prefixed with the separator.
                    abc      += (wroteWord ? "/" : "") + aa.Content.sWord;
                    wroteWord = true;

                    // Continue with the row that starts where this word ends.
                    aa = m_segGraph.GetFirstElementOfRow(aa.col);
                }
            }
            return(abc);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Forward (left-to-right) longest-match segmentation over a word graph.
        /// </summary>
        /// <remarks>
        /// The walk starts at the item that follows element (0, 1). Within a row it advances to
        /// the last item of that row, appends that item's word to the result, and then jumps
        /// to the first item of the row whose index equals the item's column. It ends when the
        /// current item has no successor; that final item is not appended.
        /// </remarks>
        /// <param name="m_segGraph">Word graph to walk.</param>
        /// <returns>
        /// The selected words separated by "/"; an empty string when the graph has no
        /// element (0, 1) or nothing was selected.
        /// </returns>
        public string ForwardSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
        {
            string abc = "";

            ChainItem<ChainContent> dfg = m_segGraph.GetElement(0, 1);
            if (null == dfg)
            {
                // Nothing to walk; the original dereferenced null here.
                return abc;
            }

            ChainItem<ChainContent> aa = dfg.next;
            bool wroteWord = false;

            // aa can become null when a row lookup finds nothing, so it is checked on every pass.
            while (null != aa && null != aa.next)
            {
                if (aa.next.row == aa.row)
                {
                    // A later item exists in the same row: keep advancing.
                    aa = aa.next;
                }
                else
                {
                    // aa is the last item of its row: emit it. Only words after the first
                    // one are prefixed with the separator.
                    abc += (wroteWord ? "/" : "") + aa.Content.sWord;
                    wroteWord = true;

                    // Continue with the row that starts where this word ends.
                    aa = m_segGraph.GetFirstElementOfRow(aa.col);
                }
            }
            return abc;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reads D:\txt2.txt as GB2312 text one character at a time and buffers the characters.
        /// On each punctuation character the buffered text is segmented with BackSplitting,
        /// the segmentation and the punctuation character are written out via WriteToTxt,
        /// and the buffer is cleared.
        /// </summary>
        /// <returns>
        /// The text still buffered when reading stops (whatever follows the last punctuation
        /// character). If an exception occurred, a "The process failed" line is appended to it.
        /// </returns>
        public static string Foo()
        {
            test segmenter = new test();

            const string punctuation = @"—()、,!“”《》『』";
            const string inputPath   = @"D:\txt2.txt";

            StringBuilder pending = new StringBuilder();

            try
            {
                using (StreamReader reader = new StreamReader(inputPath, Encoding.GetEncoding("GB2312")))
                {
                    while (reader.Peek() >= 0)
                    {
                        char ch = (char)reader.Read();

                        if (punctuation.IndexOf(ch) < 0)
                        {
                            // Ordinary character: keep collecting the current run.
                            pending.Append(ch);
                        }
                        else
                        {
                            // Punctuation closes the run: segment it, then emit the mark itself.
                            RowFirstDynamicArray <ChainContent> graph = segmenter.GetSegGraph(pending.ToString());
                            WriteToTxt(segmenter.BackSplitting(graph));
                            WriteToTxt(Convert.ToString(ch));
                            pending.Remove(0, pending.Length);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                pending.AppendFormat("The process failed: {0}", ex.ToString()).AppendLine();
            }

            return pending.ToString();
        }
Exemplo n.º 9
0
 /// <summary>
 /// Backward (right-to-left) longest-match segmentation over a word graph.
 /// </summary>
 /// <remarks>
 /// Starting at the last column, the lowest-numbered row that holds an element in the
 /// current column is selected, its word is prepended to the result, and the search
 /// continues in the column whose index equals that row. The walk stops once the
 /// current column is 1 or less.
 /// </remarks>
 /// <param name="m_segGraph">Word graph to walk.</param>
 /// <returns>
 /// The selected words in sentence order, separated by "/"; an empty string when
 /// nothing was selected.
 /// </returns>
 public string BackSplitting(RowFirstDynamicArray<ChainContent> m_segGraph)
 {
     string abc = "";
     int nCol = m_segGraph.ColumnCount - 1;
     int nRow = m_segGraph.RowCount - 1;

     while (nCol > 1)
     {
         // Look up each cell once and remember the first hit.
         ChainItem<ChainContent> found = null;
         int foundRow = -1;

         for (int i = 0; i <= nRow; i++)
         {
             found = m_segGraph.GetElement(i, nCol);
             if (null != found)
             {
                 foundRow = i;
                 break;
             }
         }

         // No element in this column: the original loop would spin forever here
         // because nCol was never updated. Stop with what has been collected.
         if (null == found)
         {
             break;
         }

         if (abc == "")
         {
             abc = found.Content.sWord;
         }
         else
         {
             abc = found.Content.sWord + "/" + abc;
         }

         // The walk must move strictly to the left, otherwise it cannot terminate.
         if (foundRow >= nCol)
         {
             break;
         }
         nCol = foundRow;
     }
     return abc;
 }
      /// <summary>
      /// Unknown word recognition: role-tags the segmentation result, then writes each
      /// recognized unknown word into the optimized segmentation graph when its possibility
      /// value is lower than the weight already stored for the same span (or when the span
      /// has no element yet).
      /// </summary>
      /// <param name="pWordSegResult">Word segmentation result to tag.</param>
      /// <param name="graphOptimum">The optimized segmentation graph; updated in place.</param>
      /// <param name="atomSegment">
      /// Atoms of the sentence. Their word lengths are accumulated to convert the positions in
      /// m_roleTag.m_nUnknownWords into atom indices.
      /// </param>
      /// <param name="dictCore">Core dictionary handed to the role tagger.</param>
      /// <returns>Always true.</returns>
      public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum,
         List<AtomNode> atomSegment, WordDictionary dictCore)
      {
         ChainItem<ChainContent> item;
         int nStartPos = 0, j = 0, nAtomStart, nAtomEnd;
         double dValue;

         //Tag the segmentation with unknown recognition roles according the core dictionary and unknown recognition dictionary
         m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

         for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
         {
            //Advance to the first atom at or beyond the start position of unknown word i.
            //nStartPos and j carry over between iterations, so the atom list is scanned only once.
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
               nStartPos += atomSegment[j++].sWord.Length;

            nAtomStart = j;

            //Advance to the first atom at or beyond the end position of unknown word i.
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
               nStartPos += atomSegment[j++].sWord.Length;

            nAtomEnd = j;
            if (nAtomStart < nAtomEnd)
            {
               //A missing element counts as infinitely expensive, so the unknown word always replaces it.
               item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
               if (item != null)
                  dValue = item.Content.eWeight;
               else
                  dValue = Predefine.INFINITE_VALUE;

               if (dValue > m_roleTag.m_dWordsPossibility[i])
                  //Set the element with less frequency
                  graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
            }
         }
         return true;
      }
Exemplo n.º 11
0
        /// <summary>
        /// Generates the segmentation word net (word graph) for an atom-segmented sentence.
        /// </summary>
        /// <remarks>
        /// Pass 1 stores every atom as element (i, i + 1). Chinese atoms keep their text with
        /// weight 0 and POS 0. Other atom types get a POS code derived from the atom type, and
        /// numbers and letter strings are replaced by placeholder words.
        /// Pass 2 grows a candidate word from each atom by appending the following atoms for as
        /// long as the core dictionary reports a match, and stores every candidate that is an
        /// exact dictionary word as element (i, j) with its summed frequency.
        /// </remarks>
        /// <param name="atomSegment">Atoms of the sentence, in order.</param>
        /// <param name="coreDict">Core dictionary used to look up candidate words.</param>
        /// <returns>The populated word graph.</returns>
        public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
        {
            string sWord = "", sMaxMatchWord;
            int nPOSRet, nPOS, nTotalFreq;
            double dValue = 0;

            RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
            m_segGraph.SetEmpty();

            // Pass 1: store the atoms in m_segGraph.
            for (int i = 0; i < atomSegment.Count; i++)//Init the cost array
            {
                if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
                    m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
                else
                {
                    sWord = atomSegment[i].sWord;//init the word
                    dValue = Predefine.MAX_FREQUENCE;
                    switch (atomSegment[i].nPOS)
                    {
                        case Predefine.CT_INDEX:
                        case Predefine.CT_NUM:
                            nPOS = -27904;//'m'*256
                            sWord = "未##数";
                            dValue = 0;
                            break;
                        case Predefine.CT_DELIMITER:
                            nPOS = 30464;//'w'*256;
                            break;
                        case Predefine.CT_LETTER:
                            nPOS = -28280; // -'n' * 256 - 'x';
                            dValue = 0;
                            sWord = "未##串";
                            break;
                        case Predefine.CT_SINGLE://12021-2129-3121
                            // An optionally signed integer or decimal is a number; anything else is a string.
                            if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                            {
                                nPOS = -27904;//'m'*256
                                sWord = "未##数";
                            }
                            else
                            {
                                nPOS = -28280; // -'n' * 256 - 'x'
                                sWord = "未##串";
                            }
                            dValue = 0;
                            break;
                        default:
                            nPOS = atomSegment[i].nPOS;//'?'*256;
                            break;
                    }
                    m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
                }
            }

            // Pass 2: store every possible multi-atom word in m_segGraph.
            for (int i = 0; i < atomSegment.Count; i++)//All the word
            {
                sWord = atomSegment[i].sWord;//Get the current atom
                int j = i + 1;

                while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
                {
                    if (sMaxMatchWord == sWord)  // exactly the word we are looking for
                    {
                        WordInfo info = coreDict.GetWordInfo(sWord); // the word may have several parts of speech

                        // Sum the frequencies of all of the word's entries.
                        nTotalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                            nTotalFreq += info.Frequencies[k];

                        // Suppress certain special words: a two-character word starting with
                        // "年" or "月" that directly follows a number.
                        if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                            Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                        {
                            // e.g. "1年内", "1999年末"
                            // NOTE(review): this literal was mis-encoded in the file and has been
                            // reconstructed from the surviving bytes; confirm it against the
                            // original GBK source.
                            if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                                break;
                        }

                        // A word with a single part of speech keeps it; otherwise the POS is recorded as 0.
                        if (info.Count == 1)
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                        else
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                    }

                    sWord += atomSegment[j++].sWord;
                }
            }
            return m_segGraph;
        }
Exemplo n.º 12
0
 /// <summary>
 /// Raises the OptimumSegment stage event, carrying the text form of the optimized graph.
 /// </summary>
 /// <param name="m_graphOptimum">Optimized segmentation graph to report.</param>
 private void OnOptimumSegment(RowFirstDynamicArray<ChainContent> m_graphOptimum)
 {
     string graphText = m_graphOptimum.ToString();
     SegmentEventArgs args = new SegmentEventArgs(SegmentStage.OptimumSegment, graphText);

     SendEvents(args);
 }
Exemplo n.º 13
0
        /// <summary>
        /// Raises the NShortPath stage event. The event text has one line per path, and each
        /// line lists the path's words, every word followed by ", ".
        /// </summary>
        /// <param name="paths">Paths to report; each entry indexes the item list of <paramref name="segGraph"/>.</param>
        /// <param name="segGraph">Word graph whose flattened item list the paths refer to.</param>
        private void OnNShortPath(List<int[]> paths, RowFirstDynamicArray<ChainContent> segGraph)
        {
            // Flattened item list of the graph; path entries are indices into it.
            List<ChainItem<ChainContent>> list = segGraph.ToListItems();
             string theWord;

             int[] aPath;
             StringBuilder sb = new StringBuilder();

             for (int i = 0; i < paths.Count; i++)
             {
            aPath = paths[i];
            for (int j = 0; j < aPath.Length; j++)
            {
               theWord = list[aPath[j]].Content.sWord;
               // NOTE(review): these literals are mis-encoded (GBK bytes decoded as UTF-8), so three of
               // the five comparisons are now identical and none can match a real placeholder word.
               // Presumably they were the "未##x" placeholders (for example "未##数", "未##时", "未##串");
               // restore them from the original source.
               if (theWord == "δ##��" || theWord == "δ##��" || theWord == "δ##��" || theWord == "δ##ʱ" || theWord == "δ##��")
               {
                  // Placeholder word: print the atoms it covers (indices row .. col - 1) instead.
                  for (int k = list[aPath[j]].row; k < list[aPath[j]].col; k++)
                     sb.Append(atomSegment[k].sWord);
                  sb.Append(", ");
               }
               else
                  sb.Append(string.Format("{0}, ", list[aPath[j]].Content.sWord));
            }

            // One line per path.
            sb.Append("\r\n");
             }

             SendEvents(new SegmentEventArgs(SegmentStage.NShortPath, sb.ToString()));
        }
Exemplo n.º 14
0
 /// <summary>
 /// Raises the GenSegGraph stage event, carrying the text form of the segmentation graph.
 /// </summary>
 /// <param name="segGraph">Segmentation graph to report.</param>
 private void OnGenSegGraph(RowFirstDynamicArray<ChainContent> segGraph)
 {
     string graphText = segGraph.ToString();
     SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenSegGraph, graphText);

     SendEvents(args);
 }
Exemplo n.º 15
0
        /// <summary>
        /// Prepares the position map that records where each word sits in the graph. Items are
        /// visited in chain order, and each entry encodes one item as
        /// row * Predefine.MAX_SENTENCE_LEN + col.
        /// </summary>
        /// <param name="aWord">Word graph whose items are mapped.</param>
        /// <returns>The position map, or null when GetTail reports a count of zero or less.</returns>
        private static int[] PreparePositionMap(RowFirstDynamicArray<ChainContent> aWord)
        {
            // GetTail returns the number of words; the tail item itself is not needed here.
            ChainItem<ChainContent> tail;
            int wordCount = aWord.GetTail(out tail);

            int[] positions = wordCount > 0 ? new int[wordCount] : null;

            // Record the position of every possible word.
            int slot = 0;
            for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
            {
                positions[slot++] = cur.row * Predefine.MAX_SENTENCE_LEN + cur.col;
            }

            return positions;
        }
Exemplo n.º 16
0
        /// <summary>
        /// Generates the word array for one segmentation route and records every resulting
        /// word in the optimum graph.
        /// </summary>
        /// <param name="uniPath">The route; not used by this method.</param>
        /// <param name="linkedArray">Linked word list of the route; rewritten by the clean-up passes.</param>
        /// <param name="m_graphOptimum">Graph that receives one element per resulting word.</param>
        /// <returns>The words of the route, or null when the linked list is empty.</returns>
        private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
        {
            if (linkedArray.Count == 0)
            {
                return null;
            }

            // Clean-up passes, applied in this order (descriptions follow the original author's notes):

            // 1. Merge all separate consecutive numbers into one number.
            MergeContinueNumIntoOne(ref linkedArray);

            // 2. Handle the delimiter made of two dashes.
            ChangeDelimiterPOS(ref linkedArray);

            // 3. If the previous word is a number and the current word starts with a dash and is
            //    longer than that one character, split the dash off from the current word.
            //    Example: "3 / -4 / 月" must become "3 / - / 4 / 月".
            SplitMiddleSlashFromDigitalWords(ref linkedArray);

            // 4. Date and time elements:
            //    a) a number followed by one of "月、日、时、分、秒、月份" is merged with it and tagged as time;
            //    b) a number usable as a year followed by "年" is merged and tagged as time, otherwise as number;
            //    c) if the last character is "点", the number is treated as time;
            //    d) if the last character is not one of "∶·./" (full-width) or '.' '/' (half-width), it is a number;
            //    e) if the last character is one of those and the length exceeds 1, the last character
            //       is removed, e.g. "1.".
            CheckDateElements(ref linkedArray);

            // Emit the result.
            WordResult[] words = new WordResult[linkedArray.Count];
            int slot = 0;

            for (WordNode node = linkedArray.first; node != null; node = node.next)
            {
                WordResult word = new WordResult();
                word.sWord = node.theWord.sWord;
                word.nPOS = node.theWord.nPOS;
                word.dValue = node.theWord.dValue;
                word.sLocation = node.row;
                words[slot] = word;

                m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));

                slot++;
            }

            return words;
        }
Exemplo n.º 17
0
        //====================================================================
        // Converts a BiPath into a WordLinkedArray.
        // Every entry of biPath is an index into the flattened item list of
        // segGraph (segGraph.ToListItems()); one WordNode is appended per entry.
        // Example from the original header (the words were mis-encoded and are
        // reconstructed here - TODO confirm): sentence "他说的确实在理",
        // BiPath: (0, 1, 2, 3, 6, 9, 11, 12)
        //    0      1   2   3   4     5   6     7   8     9   10    11  12
        // 始##始  他  说  的  的确  确  确实  实  实在  在  在理  理  末##末
        //====================================================================
        private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
        {
            // Flattened item list of the graph; path entries are indices into it.
            List<ChainItem<ChainContent>> list = segGraph.ToListItems();
             StringBuilder sb = new StringBuilder();

             WordLinkedArray result = new WordLinkedArray();

             for (int i = 0; i < biPath.Length; i++)
             {
            WordNode node = new WordNode();

            node.row = list[biPath[i]].row;
            node.col = list[biPath[i]].col;
            node.sWordInSegGraph = list[biPath[i]].Content.sWord;

            node.theWord = new WordResult();
            // NOTE(review): these literals are mis-encoded (GBK bytes decoded as UTF-8), so three of
            // the five comparisons are now identical and none can match a real placeholder word.
            // Presumably they were the "未##x" placeholders (for example "未##数", "未##时", "未##串");
            // restore them from the original source.
            if (node.sWordInSegGraph == "δ##��" || node.sWordInSegGraph == "δ##��" ||
               node.sWordInSegGraph == "δ##��" || node.sWordInSegGraph == "δ##ʱ" || node.sWordInSegGraph == "δ##��")
            {
               // Placeholder word: rebuild the surface text from the atoms it covers (row .. col - 1).
               sb.Remove(0, sb.Length);
               for (int j = node.row; j < node.col; j++)
                  sb.Append(atomSegment[j].sWord);

               node.theWord.sWord = sb.ToString();
            }
            else
               node.theWord.sWord = list[biPath[i]].Content.sWord;

            // POS and weight are copied from the graph item in both cases.
            node.theWord.nPOS = list[biPath[i]].Content.nPOS;
            node.theWord.dValue = list[biPath[i]].Content.eWeight;

            result.AppendNode(node);
             }

             return result;
        }
Exemplo n.º 18
0
        /// <summary>
        /// Runs the bigram segmentation pipeline on a sentence: atom segmentation, word net
        /// generation, bigram graph generation and N-shortest-path search. Each path is then
        /// converted into a word array, and the stage events are raised along the way.
        /// </summary>
        /// <param name="sSentence">Sentence to segment.</param>
        /// <param name="smoothPara">Smoothing parameter forwarded to BiGraphGenerate.</param>
        /// <param name="nKind">Value forwarded to NShortPath.Calculate.</param>
        /// <returns>The number of segmentation results stored in m_pWordSeg.</returns>
        /// <exception cref="Exception">biDict or coreDict has not been initialized.</exception>
        public int BiSegment(string sSentence, double smoothPara, int nKind)
        {
            WordResult[] tmpResult;
            WordLinkedArray linkedArray;

            // The message literal was mis-encoded in the file; it is restored here.
            if (biDict == null || coreDict == null)
                throw new Exception("biDict 或 coreDict 尚未初始化!");

            //---Atom segmentation
            atomSegment = AtomSegment(sSentence);
            OnAtomSegment(atomSegment);

            //---Dictionary lookup: every candidate word is stored in the word net
            segGraph = GenerateWordNet(atomSegment, coreDict);
            OnGenSegGraph(segGraph);

            //---Every possible pair of adjacent words
            biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
            OnGenBiSegGraph(biGraphResult);

            //---N-shortest-path search yields several candidate segmentations
            NShortPath.Calculate(biGraphResult, nKind);
            List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
            OnNShortPath(spResult, segGraph);

            m_pWordSeg = new List<WordResult[]>();
            m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

            for (int i = 0; i < spResult.Count; i++)
            {
                linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
                tmpResult = GenerateWord(spResult[i], linkedArray, m_graphOptimum);

                // GenerateWord returns null for an empty word list; such paths are dropped.
                if (tmpResult != null)
                    m_pWordSeg.Add(tmpResult);
            }

            OnBeforeOptimize(m_pWordSeg);

            return m_pWordSeg.Count;
        }
Exemplo n.º 19
0
        /// <summary>
        /// Runs the first three segmentation stages (atom segmentation, word net generation and
        /// bigram graph generation) and returns the bigram graph. Only the atom segmentation
        /// event is raised; the graph events are commented out in this variant.
        /// </summary>
        /// <param name="sSentence">Sentence to segment.</param>
        /// <param name="smoothPara">Smoothing parameter forwarded to BiGraphGenerate.</param>
        /// <param name="nKind">Not used by this method.</param>
        /// <returns>The bigram graph, which is also stored in biGraphResult.</returns>
        /// <exception cref="Exception">biDict or coreDict has not been initialized.</exception>
        public ColumnFirstDynamicArray<ChainContent> TestSegment(string sSentence, double smoothPara, int nKind)
        {
            // The message literal was mis-encoded in the file; it is restored here.
            if (biDict == null || coreDict == null)
                throw new Exception("biDict 或 coreDict 尚未初始化!");

            //---Atom segmentation
            atomSegment = AtomSegment(sSentence);
            OnAtomSegment(atomSegment);

            //---Dictionary lookup: every candidate word is stored in the word net
            segGraph = GenerateWordNet(atomSegment, coreDict);
            //OnGenSegGraph(segGraph);

            //---Every possible pair of adjacent words
            biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
            //OnGenBiSegGraph(biGraphResult);

            return biGraphResult;
            //--Backward optimization (disabled)
            //biGraphResult = BackwardOptimize(biGraphResult);
            //OnBackwardOptimize(biGraphResult);
        }
Exemplo n.º 20
0
        /// <summary>
        /// Generates the bigram graph between every pair of adjacent words of a word graph.
        /// </summary>
        /// <remarks>
        /// For each word, every word that starts in the row equal to its column is paired with
        /// it. The pair is stored at (index of the current word, index of the next word), where
        /// the indices come from the position map of <paramref name="aWord"/>.
        /// </remarks>
        /// <param name="aWord">Word graph to pair up.</param>
        /// <param name="smoothPara">Smoothing parameter a of the weight formula (the original notes 0 &lt; a &lt; 1).</param>
        /// <param name="biDict">Dictionary queried for the frequency of a word pair.</param>
        /// <param name="coreDict">Dictionary queried for the frequency of words with a negative POS.</param>
        /// <returns>The bigram graph.</returns>
        public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
         RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
        {
            ColumnFirstDynamicArray<ChainContent> biNet = new ColumnFirstDynamicArray<ChainContent>();
            StringBuilder pairKey = new StringBuilder();

            //Record the position map of possible words
            int[] posMap = PreparePositionMap(aWord);

            //Smoothing term; it does not depend on the words being paired
            double smooth = 1.0 / Predefine.MAX_FREQUENCE;

            for (ChainItem<ChainContent> cur = aWord.GetHead(); cur != null; cur = cur.next)
            {
                //A negative POS marks an unknown word, whose frequency comes from the core
                //dictionary; a known word carries its frequency as weight.
                double curFreq;
                if (cur.Content.nPOS < 0)
                    curFreq = coreDict.GetFrequency(cur.Content.sWord, 2);
                else
                    curFreq = cur.Content.eWeight;

                //Visit the words that begin where the current word ends (their row equals cur.col)
                for (ChainItem<ChainContent> next = aWord.GetFirstElementOfRow(cur.col);
                     next != null && next.row == cur.col;
                     next = next.next)
                {
                    pairKey.Length = 0;
                    pairKey.Append(cur.Content.sWord);
                    pairKey.Append(Predefine.WORD_SEGMENTER);
                    pairKey.Append(next.Content.sWord);

                    string twoWords = pairKey.ToString();

                    //Two linked Words frequency
                    int pairFreq = biDict.GetFrequency(twoWords, 3);

                    //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
                    double weight = -Math.Log(smoothPara * (1.0 + curFreq) / (Predefine.MAX_FREQUENCE + 80000.0)
                      + (1.0 - smoothPara) * ((1.0 - smooth) * pairFreq / (1.0 + curFreq) +
                      smooth));

                    //Unknown words: P(Wi|Ci);while known words:1
                    if (cur.Content.nPOS < 0)
                        weight += cur.Content.nPOS;

                    //Get the position index of both words in the position map table
                    int curIndex = Utility.BinarySearch(cur.row * Predefine.MAX_SENTENCE_LEN + cur.col, posMap);
                    int nextIndex = Utility.BinarySearch(next.row * Predefine.MAX_SENTENCE_LEN + next.col, posMap);

                    biNet.SetElement(curIndex, nextIndex, new ChainContent(weight, cur.Content.nPOS, twoWords));
                }
            }

            return biNet;
        }
Exemplo n.º 21
0
      /// <summary>
      /// Generates the word array for one segmentation route and records every resulting
      /// word in the optimum graph.
      /// </summary>
      /// <param name="uniPath">The route; not used by this method.</param>
      /// <param name="linkedArray">Linked word list of the route; rewritten by the clean-up passes.</param>
      /// <param name="m_graphOptimum">Graph that receives one element per resulting word.</param>
      /// <returns>The words of the route, or null when the linked list is empty.</returns>
      private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
      {
         if (linkedArray.Count == 0)
         {
            return null;
         }

         // Clean-up passes, applied in this order (descriptions follow the original author's notes):

         // 1. Merge all separate consecutive numbers into one number.
         MergeContinueNumIntoOne(ref linkedArray);

         // 2. Handle the delimiter "--".
         ChangeDelimiterPOS(ref linkedArray);

         // 3. If the previous word is a number and the current word starts with a dash and is
         //    longer than that one character, split the dash off from the current word.
         //    Example: "3 / -4 / 月" must become "3 / - / 4 / 月".
         SplitMiddleSlashFromDigitalWords(ref linkedArray);

         // 4. Date and time elements:
         //    a) a number followed by one of "月、日、时、分、秒、月份" is merged with it and tagged as time;
         //    b) a number usable as a year followed by "年" is merged and tagged as time, otherwise as number;
         //    c) if the last character is "点", the number is treated as time;
         //    d) if the last character is not one of "∶·./" or the half-width '.' '/', it is a number;
         //    e) if the last character is one of those and the length exceeds 1, the last character
         //       is removed, e.g. "1.".
         CheckDateElements(ref linkedArray);

         // Emit the result.
         WordResult[] words = new WordResult[linkedArray.Count];
         int slot = 0;

         for (WordNode node = linkedArray.first; node != null; node = node.next)
         {
            WordResult word = new WordResult();
            word.sWord = node.theWord.sWord;
            word.nPOS = node.theWord.nPOS;
            word.dValue = node.theWord.dValue;
            words[slot] = word;

            m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));

            slot++;
         }

         return words;
      }
Exemplo n.º 22
0
      //====================================================================
      // Func Name  : GenerateWordNet
      // Description: Build the segmentation word net from an atom-segmented
      //              sentence: first every single atom is stored, then every
      //              multi-atom word found in the core dictionary.
      // Parameters : atomSegment: the atoms of the sentence, in order
      //              coreDict   : core dictionary used to look up words
      // Returns    : the word net; the element stored at (i, j) is the
      //              candidate made of atoms i .. j-1
      //====================================================================
      public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
      {
         // POS codes assigned to non-Chinese atoms.
         const int posNumber = -27904;     // -('m' * 256)
         const int posDelimiter = 30464;   // 'w' * 256
         const int posString = -28280;     // -('n' * 256 + 'x')

         // Second characters for which a two-character word starting with
         // "年"/"月" is rejected when the preceding atom is a number.
         const string dateSuffixChars = "末内中底前间初";

         string sWord, sMaxMatchWord;
         int nPOSRet, nPOS, nTotalFreq;
         double dValue;

         RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
         m_segGraph.SetEmpty();

         // Pass 1: store every single atom in m_segGraph as element (i, i + 1).
         for (int i = 0; i < atomSegment.Count; i++)
         {
            if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
            {
               // Chinese atoms start with value 0 and POS 0.
               m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
               continue;
            }

            // Defaults for a non-Chinese atom; individual cases override them.
            sWord = atomSegment[i].sWord;
            dValue = Predefine.MAX_FREQUENCE;
            switch (atomSegment[i].nPOS)
            {
               case Predefine.CT_INDEX:
               case Predefine.CT_NUM:
                  nPOS = posNumber;
                  sWord = "未##数";
                  dValue = 0;
                  break;
               case Predefine.CT_DELIMITER:
                  nPOS = posDelimiter;
                  break;
               case Predefine.CT_LETTER:
                  nPOS = posString;
                  dValue = 0;
                  sWord = "未##串";
                  break;
               case Predefine.CT_SINGLE:
                  // A signed integer or decimal is tagged as a number; anything
                  // else (e.g. 12021-2129-3121) is tagged as a string.
                  if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                  {
                     nPOS = posNumber;
                     sWord = "未##数";
                  }
                  else
                  {
                     nPOS = posString;
                     sWord = "未##串";
                  }
                  dValue = 0;
                  break;
               default:
                  // Any other atom type keeps its own nPOS value.
                  nPOS = atomSegment[i].nPOS;
                  break;
            }
            m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
         }

         // Pass 2: starting at each atom, grow the candidate one atom at a time
         // and store every candidate that is an exact dictionary word.
         for (int i = 0; i < atomSegment.Count; i++)
         {
            sWord = atomSegment[i].sWord; // candidate made of atoms i .. j-1
            int j = i + 1;

            // NOTE(review): because the loop requires j < atomSegment.Count, a
            // candidate that ends at the last atom is never looked up.
            // Presumably the last atom is always the sentence-end marker - confirm.
            while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
            {
               if (sMaxMatchWord == sWord) // the candidate itself is a dictionary word
               {
                  WordInfo info = coreDict.GetWordInfo(sWord); // a word may carry several POS entries

                  // Sum the frequencies over all POS entries of the word.
                  nTotalFreq = 0;
                  for (int k = 0; k < info.Count; k++)
                     nTotalFreq += info.Frequencies[k];

                  // Reject words such as "年内" / "年末" right after a number
                  // (e.g. "1年内", "1999年末") and stop extending from this atom.
                  // Characters are compared ordinally so the result does not
                  // depend on the current culture.
                  if (sWord.Length == 2 && (sWord[0] == '年' || sWord[0] == '月') && i >= 1 &&
                     (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                     Utility.IsAllChineseNum(atomSegment[i - 1].sWord)) &&
                     dateSuffixChars.IndexOf(sWord[1]) >= 0)
                  {
                     break;
                  }

                  // Keep the POS only when it is unambiguous; otherwise store 0.
                  if (info.Count == 1)
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                  else
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
               }

               // Extend the candidate by the next atom.
               sWord += atomSegment[j++].sWord;
            }
         }
         return m_segGraph;
      }
 // Reports the PersonAndPlaceRecognition stage through SendEvents, passing
 // the string form of the given graph as the event payload.
 private void OnPersonAndPlaceRecognition(RowFirstDynamicArray<ChainContent> m_graphOptimum)
 {
    string graphText = m_graphOptimum.ToString();
    SegmentEventArgs stageArgs = new SegmentEventArgs(SegmentStage.PersonAndPlaceRecognition, graphText);
    SendEvents(stageArgs);
 }
Exemplo n.º 24
0
        // Runs the bi-gram optimum segmentation step.
        //   nResultCount   : passed to NShortPath.Calculate
        //   dSmoothingPara : passed to BiGraphGenerate
        // Side effects: m_pWordSeg is replaced by a new list, segGraph takes the
        // previous m_graphOptimum, and m_graphOptimum is replaced by a new array
        // that is handed to GenerateWord for every path.
        // Returns the number of results stored in m_pWordSeg.
        public int BiOptimumSegment(int nResultCount, double dSmoothingPara)
        {
            // Build the bi-word link net from the current optimum graph and report it.
            ColumnFirstDynamicArray<ChainContent> biwordNet = BiGraphGenerate(m_graphOptimum, dSmoothingPara, biDict, coreDict);
            OnGenBiOptimumSegGraph(biwordNet);

            // Compute the paths over the net and fetch them.
            NShortPath.Calculate(biwordNet, nResultCount);
            List<int[]> paths = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);

            // The old optimum graph becomes the segmentation graph; a fresh
            // optimum graph is passed to GenerateWord below.
            m_pWordSeg = new List<WordResult[]>();
            segGraph = m_graphOptimum;
            m_graphOptimum = new RowFirstDynamicArray<ChainContent>();

            for (int pathIndex = 0; pathIndex < paths.Count; pathIndex++)
            {
                int[] path = paths[pathIndex];
                WordLinkedArray pathWords = BiPath2LinkedArray(path, segGraph, atomSegment);
                WordResult[] words = GenerateWord(path, pathWords, m_graphOptimum);

                // Paths for which GenerateWord yields nothing are skipped.
                if (words == null)
                    continue;

                m_pWordSeg.Add(words);
            }

            return m_pWordSeg.Count;
        }