Ejemplo n.º 1
0
        /// <summary>
        /// Feeds the words of one document into a fresh <c>IntTable</c> and copies every
        /// entry that has a value into <c>Table</c>, keyed first by the word and then by
        /// the string form of the document identifier.
        /// </summary>
        /// <param name="words">Words of the document being processed.</param>
        /// <param name="docId">Identifier of the document; its <c>ToString()</c> is the inner key.</param>
        public override void ProcessWords(string[] words, object docId)
        {
            var wordCounts = new IntTable<string, int>();
            wordCounts.Add(words);

            foreach (DictionaryEntry entry in wordCounts)
            {
                // Entries without a value carry nothing to store.
                if (entry.Value == null)
                {
                    continue;
                }

                // The stored value is converted to double through its string form.
                Table[entry.Key.ToString()][docId.ToString()] = double.Parse(entry.Value.ToString());
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Segments <paramref name="text"/> into terms, computes a weight per term (the
        /// value recorded by <c>IntTable</c> for the term, presumably its occurrence count,
        /// divided by the total number of terms) and appends an (id, weight) slot to the
        /// term's list in the "WebPage" table of the index database.
        /// </summary>
        /// <param name="id">Identifier stored with every slot created for this text.</param>
        /// <param name="text">Text to segment and index.</param>
        private void InsertIndex(long id, string text)
        {
            string dataPath = "Baike\\Baike_question_index.db4";
            var frequency = new IntTable<string, int>();

            string[] results = _segment.Split(text);
            if (results == null || results.Length < 1)
            {
                // Nothing to index.
                return;
            }

            frequency.Add(results);
            foreach (DictionaryEntry de in frequency)
            {
                KeywordIndex keywordIndex = new KeywordIndex();
                keywordIndex.Keyword = de.Key.ToString();
                // "* 1.0" forces floating-point division.
                keywordIndex.Weight = int.Parse(de.Value.ToString()) * 1.0 / results.Length;
                keywordIndex.Id = id;
                Indexs.Add(keywordIndex);
            }

            if (Indexs.Count > 0)
            {
                using (IStorageEngine engine = STSdb.FromFile(dataPath))
                {
                    ITable<string, List<Slots<long, double>>> table = engine.OpenXTable<string, List<Slots<long, double>>>("WebPage");
                    foreach (var keywordIndex in Indexs)
                    {
                        // Reuse the stored list when the keyword is already indexed, otherwise start a new one.
                        List<Slots<long, double>> list = table.Find(keywordIndex.Keyword) ?? new List<Slots<long, double>>();
                        list.Add(new Slots<long, double>(keywordIndex.Id, keywordIndex.Weight));

                        // Always assign the list back: changing the object returned by Find
                        // does not by itself tell the table that the record was modified,
                        // so an append to an existing list could be lost on commit.
                        table[keywordIndex.Keyword] = list;
                    }
                    engine.Commit();
                }
                Indexs.Clear();
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Analyzes raw crawled page data: reads every page from the "WebPage" table of the
 /// database file, derives a weight per segmented term, and passes the resulting index
 /// entries and pages to <c>SaveIndex</c> / <c>SaveContent</c>.
 /// </summary>
 /// <param name="folder">Folder handed to <c>SaveIndex</c> / <c>SaveContent</c>; created when it does not exist.</param>
 /// <param name="filePath">Path of the database file opened with <c>STSdb.FromFile</c>.</param>
 public void AnalysisData(string folder, string filePath)
 {
     // Anything left over has to be stored as well (see the final calls below).
     lock (PageObj)
     {
         if (!Directory.Exists(folder))
         {
             Directory.CreateDirectory(folder);
         }
         using (IStorageEngine engine = STSdb.FromFile(filePath))
         {
             // Read the crawled pages.
             ITable<string, Crawler.Page> table = engine.OpenXTable<string, Crawler.Page>("WebPage");
             foreach (var keyValuePair in table)
             {
                 Crawler.Page page = (Crawler.Page)keyValuePair.Value;

                 // Skip missing pages and pages without content. The explicit null check on
                 // Content keeps one bad record from aborting the whole run.
                 if (page == null || page.Content == null || page.Content.Trim().Length == 0)
                 {
                     continue;
                 }

                 Console.WriteLine(page.Url);
                 var frequency = new IntTable<string, int>();
                 // The title is concatenated twice; presumably to give title terms extra weight.
                 string[] results = segment.Split(page.Title + page.Title + page.Content);
                 if (results.Length < 1)
                 {
                     continue;
                 }
                 frequency.Add(results);
                 foreach (DictionaryEntry de in frequency)
                 {
                     TextIndex textIndex = new TextIndex();
                     textIndex.Keyword = de.Key.ToString();
                     // "* 1.0" forces floating-point division.
                     textIndex.Weight = int.Parse(de.Value.ToString()) * 1.0 / results.Length;
                     textIndex.Url = page.Url;
                     indexs.Add(textIndex);
                     SaveIndex(ref indexs, folder, false);
                 }
                 pages.Add(page);
                 SaveContent(ref pages, folder, false);
             }
         }
     }
     // NOTE(review): these final calls (last argument true) run outside the lock — confirm that is intended.
     SaveIndex(ref indexs, folder, true);
     SaveContent(ref pages, folder, true);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Stores a fixed set of sample integers in an <c>IntTable</c>, sorts the table and
        /// writes every element to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the table is constructed with 9 while ten values are stored
            // below — confirm what the constructor argument means.
            IntTable table = new IntTable(9);

            int[] samples = { 3, -32, 10, 32, 0, 5, 2, 6, 9, 3 };

            int position = 0;
            foreach (int sample in samples)
            {
                table.Set(position, sample);
                position++;
            }

            table.Sort();

            for (int index = 0; index < table.Length; index++)
            {
                Console.WriteLine(table.Get(index));
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Looks up the documents of every keyword and returns them ranked by the tally
        /// accumulated for each document across the keywords.
        /// <example>
        ///  For the keywords "北京 地铁" the documents found for both keywords are merged,
        ///  so documents found for several keywords rank first.
        /// </example>
        /// </summary>
        /// <param name="keys">Keywords to look up.</param>
        /// <param name="maxCount">Maximum number of documents to return.</param>
        /// <returns>
        /// Documents in descending tally order, or <c>null</c> when <paramref name="keys"/>
        /// is <c>null</c> or no keyword produced a document.
        /// </returns>
        public List<TValue> FindCommonDocumentByKeys(TKey[] keys, int maxCount)
        {
            if (keys == null)
            {
                return null;
            }

            // Tally per document; each document found for a keyword is added with the value 1.
            IntTable<string, int> table = new IntTable<string, int>();
            foreach (TKey key in keys)
            {
                List<TKey> temp = FindDocumentByKey(key, false);
                if (temp == null)
                {
                    continue;
                }
                foreach (var t in temp)
                {
                    table.Add(t.ToString(), 1, true);
                }
            }
            if (table.Count < 1)
            {
                return null;
            }

            // Sort the entries themselves by value, highest first. This replaces matching
            // sorted values back to entries with "==" on boxed objects (a reference
            // comparison) and rescanning the whole table for every rank.
            List<DictionaryEntry> ranked = new List<DictionaryEntry>(table.Count);
            foreach (DictionaryEntry entry in table)
            {
                ranked.Add(entry);
            }
            ranked.Sort((left, right) => Comparer.Default.Compare(right.Value, left.Value));

            List<TValue> result = new List<TValue>();
            for (int i = 0; i < maxCount && i < ranked.Count; i++)
            {
                result.Add((TValue)ranked[i].Key);
            }

            return result;
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates the instance and assigns a new, empty <c>IntTable</c> with string keys
 /// and double values to <c>Table</c>.
 /// </summary>
 public InitialStateProbability()
 {
     var emptyTable = new IntTable<string, double>();
     Table = emptyTable;
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Initializes <c>Table</c> with a newly created <c>IntTable</c> that maps strings
 /// to doubles.
 /// </summary>
 public InitialStateProbability()
 {
     IntTable<string, double> initialTable = new IntTable<string, double>();
     Table = initialTable;
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds the result text for a text search: gathers the documents indexed under
        /// each keyword together with their scores, ranks them, loads the best-ranked
        /// pages and renders one result entry per page.
        /// </summary>
        /// <param name="keywords">Search keywords.</param>
        /// <returns>
        /// The concatenated result entries. If any step throws, the exception's full text
        /// is returned instead.
        /// </returns>
        private static string GetTextResult(string[] keywords)
        {
            // Upper bound on the number of ranked documents that are looked up.
            const int maxCount = 9;

            try
            {
                // 1. Collect the candidate documents and their scores.
                IntTable<string, double> docsTable = new IntTable<string, double>();
                foreach (var keyword in keywords)
                {
                    object obj = TextIndexData[keyword];
                    if (obj == null)
                    {
                        continue;
                    }
                    foreach (var slotse in (List<Slots<string, double>>)obj)
                    {
                        docsTable.Add(slotse.Slot0, slotse.Slot1);
                    }
                }

                // 2. Rank the documents, highest score first. Sorting the entries directly
                //    avoids rescanning the table per score, which added every tied document
                //    again for each tie (duplicates, and more results than the cap allows).
                List<DictionaryEntry> ranked = new List<DictionaryEntry>();
                foreach (DictionaryEntry entry in docsTable)
                {
                    ranked.Add(entry);
                }
                ranked.Sort((left, right) => ((double)right.Value).CompareTo((double)left.Value));

                // 3. Load the pages of the top-ranked documents, one page per URL.
                List<Crawler.Page> doclist = new List<Crawler.Page>();
                HashSet<string> filter = new HashSet<string>();
                for (int i = 0; i < ranked.Count && i < maxCount; i++)
                {
                    Crawler.Page page = _searchDataTable.Find(ranked[i].Key.ToString());
                    if (page != null && filter.Add(page.Url))
                    {
                        doclist.Add(page);
                    }
                }

                // 4. Render the summary of each page.
                System.Text.StringBuilder result = new System.Text.StringBuilder();
                foreach (var page in doclist)
                {
                    result.Append(BuildTextResult(ChangeColor(keywords, page.Title), CheckContent(page.Content, keywords),
                        page.Url, page.Timestamp, ""));
                }

                return result.ToString();
            }
            catch (Exception exception)
            {
                // NOTE(review): the exception text (including stack trace) becomes the
                // search result — confirm this is acceptable for the caller.
                return exception.ToString();
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Builds the result text for a knowledge search: gathers the knowledge entries
        /// indexed under each keyword together with their scores, ranks them, and renders
        /// the best-ranked entry that can be loaded from the knowledge data table.
        /// </summary>
        /// <param name="keywords">Search keywords.</param>
        /// <returns>The rendered result, or an empty string when no entry was found.</returns>
        private static string GetKnownledgeResult(string[] keywords)
        {
            // Only the best-ranked candidates are considered...
            const int maxCandidates = 5;
            // ...and only this many entries are rendered.
            const int maxResults = 1;

            // 1. Collect the candidate entries and their scores.
            IntTable<long, double> docsTable = new IntTable<long, double>();
            foreach (var keyword in keywords)
            {
                object obj = KnowledgeIndexData[keyword];
                if (obj == null)
                {
                    continue;
                }
                foreach (var slotse in (List<Slots<long, double>>)obj)
                {
                    docsTable.Add(slotse.Slot0, slotse.Slot1);
                }
            }

            // 2. Rank the entries, highest score first. Sorting the entries directly avoids
            //    rescanning the table per score, which re-added tied entries for every tie.
            List<DictionaryEntry> ranked = new List<DictionaryEntry>();
            foreach (DictionaryEntry entry in docsTable)
            {
                ranked.Add(entry);
            }
            ranked.Sort((left, right) => ((double)right.Value).CompareTo((double)left.Value));

            // 3. Load entities in rank order and stop as soon as enough were found, so the
            //    data table is not queried for candidates that can no longer be used.
            List<KnowlegeIndex.KnowledgeEntity> doclist = new List<KnowlegeIndex.KnowledgeEntity>();
            HashSet<long> filter = new HashSet<long>();
            for (int i = 0; i < ranked.Count && i < maxCandidates && doclist.Count < maxResults; i++)
            {
                KnowlegeIndex.KnowledgeEntity entity = _knowledgeDataTable.Find((long)ranked[i].Key);
                if (entity != null && filter.Add(entity.Id))
                {
                    doclist.Add(entity);
                }
            }

            // 4. Render the result entries.
            System.Text.StringBuilder result = new System.Text.StringBuilder();
            foreach (var page in doclist)
            {
                result.Append(BuildTextResult(ChangeColor(keywords, "[知]" + page.QuestionDesc), "答案:" + page.Answer,
                    page.RefUrl, page.EffectTime, ""));
            }

            return result.ToString();
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Builds entities for the given times, places, persons and events, looks up the
        /// relations between every pair of those four groups, and returns the values of
        /// the related entities ordered by the tally recorded for each entity.
        /// </summary>
        /// <param name="forQueryType">Kind of sentence element being queried; passed to <c>FindRelation</c>.</param>
        /// <param name="times">Time expressions.</param>
        /// <param name="places">Place expressions.</param>
        /// <param name="whoms">Person expressions.</param>
        /// <param name="events">Event expressions.</param>
        /// <returns>String values of the related entities, highest tally first.</returns>
        public List<string> Query(Common.Sentence forQueryType, string[] times, string[] places, string[] whoms, string[] events)
        {
            // Build the entities of each group.
            Entity[] entityTimes = BuildEntity(Common.Sentence.Time, times);
            Entity[] entityPlaces = BuildEntity(Common.Sentence.Place, places);
            Entity[] entityWhoms = BuildEntity(Common.Sentence.Whom, whoms);
            Entity[] entityEvent = BuildEntity(Common.Sentence.Event, events);

            IntTable<Entity, int> result = new IntTable<Entity, int>();

            // Every pairing of the four groups, in a fixed order.
            CountRelations(result, entityTimes, entityPlaces, forQueryType);   // time - place
            CountRelations(result, entityTimes, entityWhoms, forQueryType);    // time - person
            CountRelations(result, entityTimes, entityEvent, forQueryType);    // time - event
            CountRelations(result, entityPlaces, entityWhoms, forQueryType);   // place - person
            CountRelations(result, entityPlaces, entityEvent, forQueryType);   // place - event
            CountRelations(result, entityWhoms, entityEvent, forQueryType);    // person - event

            // Order the entities by their tally, highest first. Without this step the
            // result would come back in the table's enumeration order.
            List<DictionaryEntry> ranked = new List<DictionaryEntry>();
            foreach (DictionaryEntry de in result)
            {
                ranked.Add(de);
            }
            ranked.Sort((left, right) => Comparer.Default.Compare(right.Value, left.Value));

            List<string> keys = new List<string>(ranked.Count);
            foreach (DictionaryEntry de in ranked)
            {
                keys.Add(((Entity)de.Key).Value.ToString());
            }
            return keys;
        }

        /// <summary>
        /// Adds the entities that <c>FindRelation</c> returns for one pair of entity groups to <paramref name="counts"/>.
        /// </summary>
        private void CountRelations(IntTable<Entity, int> counts, Entity[] first, Entity[] second, Common.Sentence forQueryType)
        {
            List<Entity> related = FindRelation(first, second, forQueryType);
            if (related != null && related.Count > 0)
            {
                counts.Add(related.ToArray());
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Builds the result text for a knowledge search: gathers the knowledge entries
        /// indexed under each keyword together with their scores, ranks them, and renders
        /// the best-ranked entry that can be loaded from the knowledge data table.
        /// </summary>
        /// <param name="keywords">Search keywords.</param>
        /// <returns>The rendered result, or an empty string when no entry was found.</returns>
        private static string GetKnownledgeResult(string[] keywords)
        {
            // Only the best-ranked candidates are considered...
            const int maxCandidates = 5;
            // ...and only this many entries are rendered.
            const int maxResults = 1;

            // 1. Collect the candidate entries and their scores.
            IntTable<long, double> docsTable = new IntTable<long, double>();
            foreach (var keyword in keywords)
            {
                object obj = KnowledgeIndexData[keyword];
                if (obj == null)
                {
                    continue;
                }
                foreach (var slotse in (List<Iveely.STSdb4.Data.Slots<long, double>>)obj)
                {
                    docsTable.Add(slotse.Slot0, slotse.Slot1);
                }
            }

            // 2. Rank the entries, highest score first. Sorting the entries directly avoids
            //    rescanning the table per score, which re-added tied entries for every tie.
            List<DictionaryEntry> ranked = new List<DictionaryEntry>();
            foreach (DictionaryEntry entry in docsTable)
            {
                ranked.Add(entry);
            }
            ranked.Sort((left, right) => ((double)right.Value).CompareTo((double)left.Value));

            // 3. Load entities in rank order and stop as soon as enough were found, so the
            //    data table is not queried for candidates that can no longer be used.
            List<KnowlegeIndex.KnowledgeEntity> doclist = new List<KnowlegeIndex.KnowledgeEntity>();
            HashSet<long> filter = new HashSet<long>();
            for (int i = 0; i < ranked.Count && i < maxCandidates && doclist.Count < maxResults; i++)
            {
                KnowlegeIndex.KnowledgeEntity entity = KnowledgeDataTable.Find((long)ranked[i].Key);
                if (entity != null && filter.Add(entity.Id))
                {
                    doclist.Add(entity);
                }
            }

            // 4. Render the result entries.
            System.Text.StringBuilder result = new System.Text.StringBuilder();
            foreach (var page in doclist)
            {
                result.Append(BuildTextResult(ChangeColor(keywords, "[知]" + page.QuestionDesc), "答案:" + page.Answer,
                                              page.RefUrl, page.EffectTime, ""));
            }

            return result.ToString();
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Builds the result text for a text search: gathers the documents indexed under
        /// each keyword together with their scores, ranks them, loads the best-ranked
        /// pages and renders one result entry per page.
        /// </summary>
        /// <param name="keywords">Search keywords.</param>
        /// <returns>
        /// The concatenated result entries. If any step throws, the exception's full text
        /// is returned instead.
        /// </returns>
        private static string GetTextResult(string[] keywords)
        {
            // Upper bound on the number of ranked documents that are looked up.
            const int maxCount = 9;

            try
            {
                // 1. Collect the candidate documents and their scores.
                IntTable<string, double> docsTable = new IntTable<string, double>();
                foreach (var keyword in keywords)
                {
                    object obj = TextIndexData[keyword];
                    if (obj == null)
                    {
                        continue;
                    }
                    foreach (var slotse in (List<Iveely.STSdb4.Data.Slots<string, double>>)obj)
                    {
                        docsTable.Add(slotse.Slot0, slotse.Slot1);
                    }
                }

                // 2. Rank the documents, highest score first. Sorting the entries directly
                //    avoids rescanning the table per score, which added every tied document
                //    again for each tie (duplicates, and more results than the cap allows).
                List<DictionaryEntry> ranked = new List<DictionaryEntry>();
                foreach (DictionaryEntry entry in docsTable)
                {
                    ranked.Add(entry);
                }
                ranked.Sort((left, right) => ((double)right.Value).CompareTo((double)left.Value));

                // 3. Load the pages of the top-ranked documents, one page per URL.
                List<Crawler.Page> doclist = new List<Crawler.Page>();
                HashSet<string> filter = new HashSet<string>();
                for (int i = 0; i < ranked.Count && i < maxCount; i++)
                {
                    Crawler.Page page = SearchDataTable.Find(ranked[i].Key.ToString());
                    if (page != null && filter.Add(page.Url))
                    {
                        doclist.Add(page);
                    }
                }

                // 4. Render the summary of each page.
                System.Text.StringBuilder result = new System.Text.StringBuilder();
                foreach (var page in doclist)
                {
                    result.Append(BuildTextResult(ChangeColor(keywords, page.Title), CheckContent(page.Content, keywords),
                                                  page.Url, page.Timestamp, ""));
                }

                return result.ToString();
            }
            catch (Exception exception)
            {
                // NOTE(review): the exception text (including stack trace) becomes the
                // search result — confirm this is acceptable for the caller.
                return exception.ToString();
            }
        }