/// <summary>
/// Analyzes raw crawled web-page data: reads pages from an STSdb file, word-segments
/// each page, computes per-keyword weights, and accumulates index/content records,
/// flushing them via <c>SaveIndex</c>/<c>SaveContent</c> (final call forces a flush).
/// </summary>
/// <param name="folder">Output folder for index/content files; created if missing.</param>
/// <param name="filePath">Path of the STSdb database file containing the "WebPage" table.</param>
public void AnalysisData(string folder, string filePath)
{
    // Serialize analysis runs; the shared buffers (indexs/pages) are not thread-safe.
    lock (PageObj)
    {
        if (!Directory.Exists(folder))
        {
            Directory.CreateDirectory(folder);
        }
        string fileFlag = filePath;
        using (IStorageEngine engine = STSdb.FromFile(fileFlag))
        {
            ITable<string, Crawler.Page> table = engine.OpenXTable<string, Crawler.Page>("WebPage");
            foreach (var keyValuePair in table)
            {
                Crawler.Page page = (Crawler.Page)keyValuePair.Value;
                // BUGFIX: the original called page.Content.Trim() with only a null check on
                // page itself, so a page with a null Content threw NullReferenceException.
                if (page != null && !string.IsNullOrWhiteSpace(page.Content))
                {
                    Console.WriteLine(page.Url);
                    var frequency = new IntTable<string, int>();
                    // NOTE(review): the title is concatenated twice — presumably a deliberate
                    // title-weight boost, but it may be a copy-paste slip; confirm intent.
                    string[] results = segment.Split(page.Title + page.Title + page.Content);
                    if (results.Length < 1)
                    {
                        continue;
                    }
                    frequency.Add(results);
                    foreach (DictionaryEntry de in frequency)
                    {
                        TextIndex textIndex = new TextIndex();
                        textIndex.Keyword = de.Key.ToString();
                        // Weight = term frequency normalized by total token count.
                        // BUGFIX: avoid culture-sensitive int.Parse on a boxed int's
                        // string form; Convert.ToInt32 unboxes/converts directly.
                        textIndex.Weight = Convert.ToInt32(de.Value) * 1.0 / results.Length;
                        textIndex.Url = page.Url;
                        indexs.Add(textIndex);
                        // Non-forced save: presumably flushes only when the buffer is full.
                        SaveIndex(ref indexs, folder, false);
                    }
                    pages.Add(page);
                    SaveContent(ref pages, folder, false);
                }
            }
        }
    }
    // Forced final flush of any remaining buffered records.
    SaveIndex(ref indexs, folder, true);
    SaveContent(ref pages, folder, true);
}
/// <summary>
/// Builds the text search result for the given keywords: collects candidate documents
/// from the in-memory text index, ranks them by accumulated weight (descending), takes
/// the top matches, and renders each as a highlighted snippet.
/// </summary>
/// <param name="keywords">Segmented query keywords to look up in <c>TextIndexData</c>.</param>
/// <returns>
/// The concatenated rendered results, or the exception text if anything fails
/// (original contract preserved — callers receive the error string, not a throw).
/// </returns>
private static string GetTextResult(string[] keywords)
{
    try
    {
        // 1. Gather candidate documents (url -> score) for every query keyword.
        // NOTE(review): if the same url appears under several keywords, behavior
        // depends on IntTable.Add semantics (accumulate vs. overwrite) — confirm.
        IntTable<string, double> docsTable = new IntTable<string, double>();
        foreach (var keyword in keywords)
        {
            object obj = TextIndexData[keyword];
            if (obj == null)
            {
                continue;
            }
            List<Iveely.STSdb4.Data.Slots<string, double>> textIndexs =
                (List<Iveely.STSdb4.Data.Slots<string, double>>)obj;
            if (textIndexs != null)
            {
                foreach (var slotse in textIndexs)
                {
                    docsTable.Add(slotse.Slot0, slotse.Slot1);
                }
            }
        }
        // 2. Rank documents by score, descending.
        // BUGFIX: the original sorted the values, then for each of the top 9 values
        // re-enumerated the whole table and matched doubles with an epsilon — O(n^2),
        // and near-equal scores of unrelated documents could collide. Sorting the
        // (url, score) pairs once is O(n log n) and exact.
        var ranked = new List<KeyValuePair<string, double>>();
        IDictionaryEnumerator ide = docsTable.GetEnumerator();
        while (ide.MoveNext())
        {
            ranked.Add(new KeyValuePair<string, double>(ide.Key.ToString(), (double)ide.Value));
        }
        ranked.Sort((a, b) => b.Value.CompareTo(a.Value));
        // Keep at most the top 9 documents (same cap as the original maxCount).
        List<string> sortedList = new List<string>();
        int maxCount = 9;
        foreach (var entry in ranked)
        {
            if (sortedList.Count >= maxCount)
            {
                break;
            }
            sortedList.Add(entry.Key);
        }
        // 3. Resolve each ranked url to its crawled page, de-duplicating by url.
        List<Crawler.Page> doclist = new List<Crawler.Page>();
        HashSet<string> filter = new HashSet<string>();
        foreach (string url in sortedList)
        {
            Crawler.Page page = SearchDataTable.Find(url);
            if (page != null && !filter.Contains(page.Url))
            {
                doclist.Add(page);
                filter.Add(page.Url);
            }
        }
        // 4. Render each page as a highlighted summary and concatenate.
        // Concatenation is acceptable here: doclist holds at most 9 entries.
        string result = "";
        foreach (var page in doclist)
        {
            result += BuildTextResult(
                ChangeColor(keywords, page.Title),
                CheckContent(page.Content, keywords),
                page.Url,
                page.Timestamp,
                "");
        }
        return (result);
    }
    catch (Exception exception)
    {
        // NOTE(review): returning exception.ToString() leaks stack traces to the
        // caller; kept to preserve the original contract, but consider logging
        // and returning a neutral message instead.
        return (exception.ToString());
    }
}