Exemplo n.º 1
0
 /// <summary>
 /// Analyzes the raw web pages stored in an STSdb file. Every page with non-blank
 /// content is segmented into terms, one <c>TextIndex</c> entry is produced per
 /// distinct term, and both the index entries and the page itself are handed to
 /// <c>SaveIndex</c> / <c>SaveContent</c>.
 /// </summary>
 /// <param name="folder">Output folder passed to <c>SaveIndex</c> and <c>SaveContent</c>; created when it does not exist.</param>
 /// <param name="filePath">Path of the STSdb file whose "WebPage" table is read.</param>
 public void AnalysisData(string folder, string filePath)
 {
     // Buffered entries are handed to SaveIndex / SaveContent as they accumulate
     // (presumably persisted once a buffer grows large enough - confirm in those methods).
     lock (PageObj)
     {
         // CreateDirectory is a no-op when the folder already exists, so no Exists check is needed.
         Directory.CreateDirectory(folder);

         using (IStorageEngine engine = STSdb.FromFile(filePath))
         {
             // Read every stored page from the "WebPage" table.
             ITable <string, Crawler.Page> table = engine.OpenXTable <string, Crawler.Page>("WebPage");
             foreach (var keyValuePair in table)
             {
                 Crawler.Page page = (Crawler.Page)keyValuePair.Value;

                 // Skip missing pages and pages without usable text; a null Content
                 // must not abort the whole run with a NullReferenceException.
                 if (page == null || string.IsNullOrWhiteSpace(page.Content))
                 {
                     continue;
                 }

                 Console.WriteLine(page.Url);

                 // The title is fed in twice, so title terms count double in the frequency table.
                 string[] results = segment.Split(page.Title + page.Title + page.Content);
                 if (results == null || results.Length < 1)
                 {
                     continue;
                 }

                 var frequency = new IntTable <string, int>();
                 frequency.Add(results);

                 foreach (DictionaryEntry de in frequency)
                 {
                     TextIndex textIndex = new TextIndex();
                     textIndex.Keyword = de.Key.ToString();
                     // Weight = occurrences of the term / total number of terms on the page.
                     textIndex.Weight  = Convert.ToInt32(de.Value) * 1.0 / results.Length;
                     textIndex.Url     = page.Url;
                     indexs.Add(textIndex);
                     SaveIndex(ref indexs, folder, false);
                 }

                 pages.Add(page);
                 SaveContent(ref pages, folder, false);
             }
         }

         // Final flush with the last argument set to true. It stays inside the lock
         // because indexs / pages are only ever touched under PageObj above; flushing
         // them unlocked could interleave with another caller filling the same buffers.
         SaveIndex(ref indexs, folder, true);
         SaveContent(ref pages, folder, true);
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Builds the text search result for the given keywords: gathers the documents
        /// listed in <c>TextIndexData</c> for each keyword together with their scores,
        /// keeps the highest-scoring ones, loads their pages from <c>SearchDataTable</c>
        /// and concatenates one <c>BuildTextResult</c> entry per page.
        /// </summary>
        /// <param name="keywords">Keywords to look up in <c>TextIndexData</c>.</param>
        /// <returns>
        /// The concatenated result entries (empty when nothing matches), or the text of
        /// the exception when anything fails along the way.
        /// </returns>
        private static string GetTextResult(string[] keywords)
        {
            // Maximum number of ranked documents that are looked up and returned.
            const int maxResults = 9;

            try
            {
                // 1. Collect the candidate documents and their scores.
                IntTable <string, double> docsTable = new IntTable <string, double>();
                foreach (var keyword in keywords)
                {
                    object obj = TextIndexData[keyword];
                    if (obj == null)
                    {
                        continue;
                    }

                    // A direct cast of a non-null object never yields null, so no second null check is needed.
                    var textIndexs = (List <Iveely.STSdb4.Data.Slots <string, double> >)obj;
                    foreach (var slotse in textIndexs)
                    {
                        // Slot0 is used as the document key, Slot1 as its score contribution.
                        docsTable.Add(slotse.Slot0, slotse.Slot1);
                    }
                }

                // 2. Rank the documents by score, highest first.
                // Sorting (url, score) pairs once keeps the result bounded by maxResults even
                // when several documents share a score; matching URLs back to sorted values
                // would append every tied URL once per duplicate value.
                var ranked = new List <KeyValuePair <string, double> >();
                IDictionaryEnumerator ide = docsTable.GetEnumerator();
                while (ide.MoveNext())
                {
                    ranked.Add(new KeyValuePair <string, double>(ide.Key.ToString(), (double)ide.Value));
                }

                ranked.Sort((left, right) =>
                {
                    int byScore = right.Value.CompareTo(left.Value);
                    // Ordinal URL order as tie-breaker keeps the ranking deterministic.
                    return byScore != 0 ? byScore : string.CompareOrdinal(left.Key, right.Key);
                });

                if (ranked.Count > maxResults)
                {
                    ranked.RemoveRange(maxResults, ranked.Count - maxResults);
                }

                // 3. Load the pages, skipping unknown URLs and pages already emitted.
                List <Crawler.Page> doclist = new List <Crawler.Page>();
                HashSet <string>    filter  = new HashSet <string>();
                foreach (var entry in ranked)
                {
                    Crawler.Page page = SearchDataTable.Find(entry.Key);
                    if (page != null && filter.Add(page.Url))
                    {
                        doclist.Add(page);
                    }
                }

                // 4. Build the highlighted title / summary entry for each page and return them.
                var result = new System.Text.StringBuilder();
                foreach (var page in doclist)
                {
                    result.Append(BuildTextResult(ChangeColor(keywords, page.Title), CheckContent(page.Content, keywords),
                                                  page.Url, page.Timestamp, ""));
                }

                return(result.ToString());
            }
            catch (Exception exception)
            {
                // Existing contract: failures are reported to the caller as the exception text.
                // NOTE(review): this exposes stack traces if the string reaches end users - confirm intent.
                return(exception.ToString());
            }
        }