Example #1
0
        //====================================================================
        // Func Name  : GenerateWordNet
        // Description: Build the segmentation word graph for a sequence of
        //              atoms. Every atom i is stored as an edge (i, i + 1);
        //              then, for every start atom, runs of consecutive atoms
        //              are concatenated and each run that the dictionary
        //              returns as its own maximum match is stored as an
        //              edge (i, j) weighted with its summed frequencies.
        // Parameters : atomSegment: atoms of the sentence, in order
        //              coreDict   : dictionary used to look up words
        // Returns    : the populated RowFirstDynamicArray<ChainContent>
        //====================================================================
        public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
        {
            RowFirstDynamicArray<ChainContent> segGraph = new RowFirstDynamicArray<ChainContent>();
            segGraph.SetEmpty();

            // Pass 1: store every atom as an edge (i, i + 1).
            for (int i = 0; i < atomSegment.Count; i++)
            {
                AtomNode atom = atomSegment[i];

                if (atom.nPOS == Predefine.CT_CHINESE)
                {
                    segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atom.sWord));
                    continue;
                }

                // Defaults for a non-Chinese atom; the cases below override them.
                string label  = atom.sWord;
                double weight = Predefine.MAX_FREQUENCE;
                int    pos;

                switch (atom.nPOS)
                {
                    case Predefine.CT_INDEX:
                    case Predefine.CT_NUM:
                        pos    = -27904; // -('m' * 256)
                        label  = "未##数";
                        weight = 0;
                        break;

                    case Predefine.CT_DELIMITER:
                        pos = 30464; // 'w' * 256
                        break;

                    case Predefine.CT_LETTER:
                        pos    = -28280; // -('n' * 256 + 'x')
                        label  = "未##串";
                        weight = 0;
                        break;

                    case Predefine.CT_SINGLE:
                        // The original comment gives "12021-2129-3121" as a sample atom here.
                        // An optionally signed decimal number gets the number placeholder,
                        // anything else the string placeholder.
                        if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                        {
                            pos   = -27904; // -('m' * 256)
                            label = "未##数";
                        }
                        else
                        {
                            pos   = -28280; // -('n' * 256 + 'x')
                            label = "未##串";
                        }
                        weight = 0;
                        break;

                    default:
                        pos = atom.nPOS;
                        break;
                }

                segGraph.SetElement(i, i + 1, new ChainContent(weight, pos, label));
            }

            // Pass 2: store every run of consecutive atoms that is a dictionary word.
            for (int i = 0; i < atomSegment.Count; i++)
            {
                string candidate = atomSegment[i].sWord;
                int    j         = i + 1;
                string maxMatch;
                int    matchPos; // required by GetMaxMatch, not used here

                // NOTE(review): because of "j < Count", a candidate that includes the final
                // atom is never looked up - presumably the final atom is a sentence-end
                // marker; confirm against the caller.
                while (j < atomSegment.Count && coreDict.GetMaxMatch(candidate, out maxMatch, out matchPos))
                {
                    if (maxMatch == candidate) // the candidate itself is a dictionary word
                    {
                        WordInfo info = coreDict.GetWordInfo(candidate); // may carry several parts of speech

                        // Sum the frequencies over all parts of speech.
                        int totalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                        {
                            totalFreq += info.Frequencies[k];
                        }

                        // Suppress two-character words such as the "年内" in "1年内" or the "年末"
                        // in "1999年末" when the previous atom is a number. Characters are
                        // compared ordinally; a culture-sensitive string search is not wanted
                        // for a plain membership test.
                        if (candidate.Length == 2 && (candidate[0] == '年' || candidate[0] == '月') && i >= 1 &&
                            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                             Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                        {
                            if ("末内中底前间初".IndexOf(candidate[1]) >= 0)
                            {
                                break; // also stops looking for longer words starting at i
                            }
                        }

                        // Keep the part of speech only when it is unambiguous; otherwise record 0.
                        if (info.Count == 1)
                        {
                            segGraph.SetElement(i, j, new ChainContent(totalFreq, info.POSs[0], candidate));
                        }
                        else
                        {
                            segGraph.SetElement(i, j, new ChainContent(totalFreq, 0, candidate));
                        }
                    }

                    candidate += atomSegment[j++].sWord;
                }
            }

            return segGraph;
        }
Example #2
0
        //====================================================================
        // Func Name  : GenerateWordNet
        // Description: Build the segmentation word graph for a sequence of
        //              atoms. Every atom i is stored as an edge (i, i + 1);
        //              then, for every start atom, runs of consecutive atoms
        //              are concatenated and each run that the dictionary
        //              returns as its own maximum match is stored as an
        //              edge (i, j) weighted with its summed frequencies.
        // Parameters : atomSegment: atoms of the sentence, in order
        //              coreDict   : dictionary used to look up words
        // Returns    : the populated RowFirstDynamicArray<ChainContent>
        //====================================================================
        public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
        {
            string sWord, sMaxMatchWord;
            int nPOSRet, nPOS, nTotalFreq;
            double dValue;

            RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
            m_segGraph.SetEmpty();

            // Store every atom in m_segGraph as an edge (i, i + 1).
            for (int i = 0; i < atomSegment.Count; i++)
            {
                if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
                {
                    m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
                }
                else
                {
                    // Defaults for a non-Chinese atom; the cases below override them.
                    sWord = atomSegment[i].sWord;
                    dValue = Predefine.MAX_FREQUENCE;
                    switch (atomSegment[i].nPOS)
                    {
                        case Predefine.CT_INDEX:
                        case Predefine.CT_NUM:
                            nPOS = -27904; // -('m' * 256)
                            sWord = "未##数";
                            dValue = 0;
                            break;
                        case Predefine.CT_DELIMITER:
                            nPOS = 30464; // 'w' * 256
                            break;
                        case Predefine.CT_LETTER:
                            nPOS = -28280; // -('n' * 256 + 'x')
                            dValue = 0;
                            sWord = "未##串";
                            break;
                        case Predefine.CT_SINGLE:
                            // The original comment gives "12021-2129-3121" as a sample atom here.
                            // An optionally signed decimal number gets the number placeholder,
                            // anything else the string placeholder.
                            if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                            {
                                nPOS = -27904; // -('m' * 256)
                                sWord = "未##数";
                            }
                            else
                            {
                                nPOS = -28280; // -('n' * 256 + 'x')
                                sWord = "未##串";
                            }
                            dValue = 0;
                            break;
                        default:
                            nPOS = atomSegment[i].nPOS;
                            break;
                    }
                    m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
                }
            }

            // Store every run of consecutive atoms that is a dictionary word in m_segGraph.
            for (int i = 0; i < atomSegment.Count; i++)
            {
                sWord = atomSegment[i].sWord; // start from the current atom
                int j = i + 1;

                // NOTE(review): because of "j < Count", a candidate that includes the final
                // atom is never looked up - presumably the final atom is a sentence-end
                // marker; confirm against the caller.
                while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
                {
                    if (sMaxMatchWord == sWord) // the candidate itself is a dictionary word
                    {
                        WordInfo info = coreDict.GetWordInfo(sWord); // may carry several parts of speech

                        // Sum the frequencies over all parts of speech.
                        nTotalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                        {
                            nTotalFreq += info.Frequencies[k];
                        }

                        // Suppress two-character words such as the "年内" in "1年内" or the "年末"
                        // in "1999年末" when the previous atom is a number. Characters are
                        // compared ordinally; a culture-sensitive string search is not wanted
                        // for a plain membership test.
                        if (sWord.Length == 2 && (sWord[0] == '年' || sWord[0] == '月') && i >= 1 &&
                            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                             Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                        {
                            if ("末内中底前间初".IndexOf(sWord[1]) >= 0)
                            {
                                break; // also stops looking for longer words starting at i
                            }
                        }

                        // Keep the part of speech only when it is unambiguous; otherwise record 0.
                        if (info.Count == 1)
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                        }
                        else
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                        }
                    }

                    sWord += atomSegment[j++].sWord;
                }
            }
            return m_segGraph;
        }
      //====================================================================
      // Func Name  : GenerateWordNet
      // Description: Build the segmentation word graph for a sequence of
      //              atoms. Every atom i is stored as an edge (i, i + 1);
      //              then, for every start atom, runs of consecutive atoms
      //              are concatenated and each run that the dictionary
      //              returns as its own maximum match is stored as an
      //              edge (i, j) weighted with its summed frequencies.
      // Parameters : atomSegment: atoms of the sentence, in order
      //              coreDict   : dictionary used to look up words
      // Returns    : the populated RowFirstDynamicArray<ChainContent>
      //====================================================================
      public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
      {
         RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
         m_segGraph.SetEmpty();

         // Store every atom in m_segGraph as an edge (i, i + 1).
         for (int i = 0; i < atomSegment.Count; i++)
         {
            m_segGraph.SetElement(i, i + 1, CreateAtomContent(atomSegment[i]));
         }

         // Store every run of consecutive atoms that is a dictionary word in m_segGraph.
         for (int i = 0; i < atomSegment.Count; i++)
         {
            string sWord = atomSegment[i].sWord; // start from the current atom
            string sMaxMatchWord;
            int nPOSRet; // required by GetMaxMatch, not used here
            int j = i + 1;

            // NOTE(review): because of "j < Count", a candidate that includes the final
            // atom is never looked up - presumably the final atom is a sentence-end
            // marker; confirm against the caller.
            while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
            {
               if (sMaxMatchWord == sWord) // the candidate itself is a dictionary word
               {
                  WordInfo info = coreDict.GetWordInfo(sWord); // may carry several parts of speech

                  // Sum the frequencies over all parts of speech.
                  int nTotalFreq = 0;
                  for (int k = 0; k < info.Count; k++)
                  {
                     nTotalFreq += info.Frequencies[k];
                  }

                  if (IsRestrictedDateWord(atomSegment, i, sWord))
                  {
                     break; // also stops looking for longer words starting at i
                  }

                  // Keep the part of speech only when it is unambiguous; otherwise record 0.
                  if (info.Count == 1)
                  {
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                  }
                  else
                  {
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                  }
               }

               sWord += atomSegment[j++].sWord;
            }
         }

         return m_segGraph;
      }

      // Maps a single atom to the ChainContent stored on its (i, i + 1) edge.
      private static ChainContent CreateAtomContent(AtomNode atom)
      {
         if (atom.nPOS == Predefine.CT_CHINESE)
         {
            return new ChainContent(0, 0, atom.sWord);
         }

         // Defaults for a non-Chinese atom; the cases below override them.
         string sWord = atom.sWord;
         double dValue = Predefine.MAX_FREQUENCE;
         int nPOS;

         switch (atom.nPOS)
         {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
               nPOS = -27904; // -('m' * 256)
               sWord = "未##数";
               dValue = 0;
               break;
            case Predefine.CT_DELIMITER:
               nPOS = 30464; // 'w' * 256
               break;
            case Predefine.CT_LETTER:
               nPOS = -28280; // -('n' * 256 + 'x')
               dValue = 0;
               sWord = "未##串";
               break;
            case Predefine.CT_SINGLE:
               // The original comment gives "12021-2129-3121" as a sample atom here.
               // An optionally signed decimal number gets the number placeholder,
               // anything else the string placeholder.
               if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
               {
                  nPOS = -27904; // -('m' * 256)
                  sWord = "未##数";
               }
               else
               {
                  nPOS = -28280; // -('n' * 256 + 'x')
                  sWord = "未##串";
               }
               dValue = 0;
               break;
            default:
               nPOS = atom.nPOS;
               break;
         }

         return new ChainContent(dValue, nPOS, sWord);
      }

      // Returns true for a two-character word that starts with '年' or '月', ends with one of
      // "末内中底前间初" and follows a numeric atom (e.g. the "年内" in "1年内", the "年末" in
      // "1999年末"). Characters are compared ordinally rather than with a culture-sensitive
      // string search.
      private static bool IsRestrictedDateWord(List<AtomNode> atomSegment, int i, string sWord)
      {
         return sWord.Length == 2 && (sWord[0] == '年' || sWord[0] == '月') && i >= 1 &&
            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
             Utility.IsAllChineseNum(atomSegment[i - 1].sWord)) &&
            "末内中底前间初".IndexOf(sWord[1]) >= 0;
      }