/// <summary>
/// Feeds the words of one document into a temporary <c>IntTable&lt;string, int&gt;</c> and copies
/// every non-null value of that table into <c>Table</c>, keyed first by word and then by
/// the string form of <paramref name="docId"/>.
/// </summary>
/// <param name="words">Words of the document; passed unchanged to <c>IntTable.Add</c>.</param>
/// <param name="docId">Document identifier; its <c>ToString()</c> is used as the inner key.</param>
public override void ProcessWords(string[] words, object docId)
{
    // Presumably Add tallies how often each word occurs - confirm against IntTable.
    IntTable<string, int> wordCounts = new IntTable<string, int>();
    wordCounts.Add(words);

    foreach (DictionaryEntry entry in wordCounts)
    {
        if (entry.Value == null)
        {
            continue;
        }

        string word = entry.Key.ToString();
        string documentKey = docId.ToString();

        // The value is round-tripped through its string form, exactly as before.
        Table[word][documentKey] = double.Parse(entry.Value.ToString());
    }
}
/// <summary>
/// Segments <paramref name="text"/> into keywords, weights each keyword by
/// (value recorded in the frequency table) / (total number of segments), and appends an
/// (id, weight) slot to that keyword's list in the "WebPage" table of the
/// Baike question index file.
/// </summary>
/// <param name="id">Identifier stored in every slot produced from this text.</param>
/// <param name="text">Text handed to the segmenter.</param>
private void InsertIndex(long id, string text)
{
    string dataPath = "Baike\\Baike_question_index.db4";

    string[] results = _segment.Split(text);
    if (results == null || results.Length < 1)
    {
        return;
    }

    var frequency = new IntTable<string, int>();
    frequency.Add(results);

    foreach (DictionaryEntry de in frequency)
    {
        KeywordIndex keywordIndex = new KeywordIndex();
        keywordIndex.Keyword = de.Key.ToString();
        keywordIndex.Weight = int.Parse(de.Value.ToString()) * 1.0 / results.Length;
        keywordIndex.Id = id;
        Indexs.Add(keywordIndex);
    }

    if (Indexs.Count == 0)
    {
        return;
    }

    using (IStorageEngine engine = STSdb.FromFile(dataPath))
    {
        ITable<string, List<Slots<long, double>>> table =
            engine.OpenXTable<string, List<Slots<long, double>>>("WebPage");

        foreach (var keywordIndex in Indexs)
        {
            // Append to the existing list, or start a new one when the keyword
            // is unknown (a stored empty list is treated like a missing one).
            List<Slots<long, double>> list = table.Find(keywordIndex.Keyword);
            if (list == null || list.Count == 0)
            {
                list = new List<Slots<long, double>>();
            }

            list.Add(new Slots<long, double>(keywordIndex.Id, keywordIndex.Weight));

            // Always write the list back. Previously an existing list was only
            // mutated in memory, which persists the new slot only if Find
            // happens to hand out the live stored instance.
            table[keywordIndex.Keyword] = list;
        }

        engine.Commit();
    }

    // Cleared only after a successful commit, as before, so a failed write
    // leaves the pending entries in place.
    Indexs.Clear();
}
/// <summary>
/// Analyses raw web page data: reads every page from the "WebPage" table of the
/// storage file, segments title and content into keywords, and hands the resulting
/// keyword weights and pages to <c>SaveIndex</c> / <c>SaveContent</c>.
/// </summary>
/// <param name="folder">Output folder; created when missing and passed to <c>SaveIndex</c> / <c>SaveContent</c>.</param>
/// <param name="filePath">Path of the storage file the pages are read from.</param>
public void AnalysisData(string folder, string filePath)
{
    // Original note: anything left over needs to be stored in the database.
    lock (PageObj)
    {
        if (!Directory.Exists(folder))
        {
            Directory.CreateDirectory(folder);
        }

        using (IStorageEngine engine = STSdb.FromFile(filePath))
        {
            ITable<string, Crawler.Page> table = engine.OpenXTable<string, Crawler.Page>("WebPage");
            foreach (var keyValuePair in table)
            {
                Crawler.Page page = (Crawler.Page)keyValuePair.Value;

                // Skip missing pages and pages without usable content. The explicit
                // null check on Content avoids a NullReferenceException on Trim().
                if (page == null || page.Content == null || page.Content.Trim().Length == 0)
                {
                    continue;
                }

                Console.WriteLine(page.Url);

                var frequency = new IntTable<string, int>();

                // The title is concatenated twice, which presumably gives title words
                // extra weight - confirm intent.
                string[] results = segment.Split(page.Title + page.Title + page.Content);
                if (results.Length < 1)
                {
                    continue;
                }

                frequency.Add(results);
                foreach (DictionaryEntry de in frequency)
                {
                    TextIndex textIndex = new TextIndex();
                    textIndex.Keyword = de.Key.ToString();
                    textIndex.Weight = int.Parse(de.Value.ToString()) * 1.0 / results.Length;
                    textIndex.Url = page.Url;
                    indexs.Add(textIndex);

                    // Called with false after every keyword; presumably flushes only
                    // once enough entries have accumulated - verify in SaveIndex.
                    SaveIndex(ref indexs, folder, false);
                }

                pages.Add(page);
                SaveContent(ref pages, folder, false);
            }
        }
    }

    // Final calls with true; presumably force out whatever is still buffered.
    // NOTE(review): these run outside the lock, as in the original - confirm that is intended.
    SaveIndex(ref indexs, folder, true);
    SaveContent(ref pages, folder, true);
}
/// <summary>
/// Demo entry point: stores a fixed set of integers in an <c>IntTable</c>, sorts it,
/// and prints each element on its own line.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int[] numbers = { 3, -32, 10, 32, 0, 5, 2, 6, 9, 3 };

    // NOTE(review): the table is constructed with 9 while ten values are stored
    // (indices 0..9) - confirm what the constructor argument means.
    IntTable intTable = new IntTable(9);

    int position = 0;
    foreach (int number in numbers)
    {
        intTable.Set(position, number);
        position++;
    }

    intTable.Sort();

    int index = 0;
    while (index < intTable.Length)
    {
        Console.WriteLine(intTable.Get(index));
        index++;
    }
}
/// <summary>
/// Looks up the documents of every keyword and merges them by how many of the
/// keywords each document was found for.
/// <example>
/// For the keywords "北京" and "地铁", the documents of both are merged and the
/// documents found for more keywords rank first.
/// </example>
/// </summary>
/// <param name="keys">Keywords to look up.</param>
/// <param name="maxCount">Maximum number of documents to return.</param>
/// <returns>
/// Up to <paramref name="maxCount"/> documents ordered by descending count,
/// or <c>null</c> when no keyword produced a document.
/// </returns>
public List<TValue> FindCommonDocumentByKeys(TKey[] keys, int maxCount)
{
    IntTable<string, int> table = new IntTable<string, int>();
    foreach (TKey key in keys)
    {
        List<TKey> temp = FindDocumentByKey(key, false);
        if (temp == null)
        {
            continue;
        }

        foreach (var t in temp)
        {
            table.Add(t.ToString(), 1, true);
        }
    }

    if (table.Count < 1)
    {
        return null;
    }

    // Rank the entries themselves instead of sorting the values and then looking
    // each value up again with '==' on boxed objects. That was a reference
    // comparison and only matched when both sides were the same boxed instance.
    List<DictionaryEntry> ranked = new List<DictionaryEntry>();
    foreach (DictionaryEntry entry in table)
    {
        ranked.Add(entry);
    }

    // Same comparer ArrayList.Sort() uses by default, applied in descending order.
    ranked.Sort((a, b) => System.Collections.Comparer.Default.Compare(b.Value, a.Value));

    List<TValue> result = new List<TValue>();
    for (int i = 0; i < maxCount && i < ranked.Count; i++)
    {
        result.Add((TValue)ranked[i].Key);
    }

    return result;
}
/// <summary>
/// Creates the instance and assigns a new, empty
/// <c>IntTable&lt;string, double&gt;</c> to <c>Table</c>.
/// </summary>
public InitialStateProbability()
{
    Table = new IntTable<string, double>();
}
/// <summary>
/// Creates the instance and assigns a new, empty
/// <c>IntTable&lt;string, double&gt;</c> to <c>Table</c>.
/// </summary>
public InitialStateProbability()
{
    Table = new IntTable<string, double>();
}
/// <summary>
/// Builds the text search result for the given keywords: accumulates per-document
/// scores from the text index, picks the documents matching the highest scores,
/// loads their pages and concatenates one rendered snippet per page.
/// </summary>
/// <param name="keywords">Keywords to look up in <c>TextIndexData</c>.</param>
/// <returns>
/// The concatenated snippets, or the text of the exception when anything fails.
/// </returns>
private static string GetTextResult(string[] keywords)
{
    try
    {
        // 1. Collect the documents of every keyword together with their scores.
        IntTable<string, double> scores = new IntTable<string, double>();
        foreach (string keyword in keywords)
        {
            object raw = TextIndexData[keyword];
            if (raw == null)
            {
                continue;
            }

            List<Slots<string, double>> postings = (List<Slots<string, double>>)raw;
            foreach (var posting in postings)
            {
                scores.Add(posting.Slot0, posting.Slot1);
            }
        }

        // 2. Rank: walk the nine highest score values and collect every document
        //    whose score equals one of them (within a small tolerance).
        ArrayList orderedScores = new ArrayList(scores.Values);
        orderedScores.Sort();
        orderedScores.Reverse();

        List<string> rankedUrls = new List<string>();
        for (int rank = 0; rank < orderedScores.Count && rank < 9; rank++)
        {
            double wanted = (double)orderedScores[rank];
            IDictionaryEnumerator cursor = scores.GetEnumerator();
            while (cursor.MoveNext())
            {
                double difference = Math.Abs((double)cursor.Value - wanted);
                if (difference < 0.00000001)
                {
                    rankedUrls.Add(cursor.Key.ToString());
                }
            }
        }

        // 3. Load the pages, keeping only the first occurrence of each URL.
        List<Crawler.Page> pages = new List<Crawler.Page>();
        HashSet<string> seenUrls = new HashSet<string>();
        foreach (string url in rankedUrls)
        {
            Crawler.Page candidate = _searchDataTable.Find(url);
            if (candidate != null && seenUrls.Add(candidate.Url))
            {
                pages.Add(candidate);
            }
        }

        // 4. Render a snippet for every page and return them concatenated.
        string output = "";
        foreach (var page in pages)
        {
            output += BuildTextResult(ChangeColor(keywords, page.Title), CheckContent(page.Content, keywords), page.Url, page.Timestamp, "");
        }

        return output;
    }
    catch (Exception exception)
    {
        return exception.ToString();
    }
}
/// <summary>
/// Builds the knowledge search result for the given keywords: accumulates
/// per-entry scores from the knowledge index, ranks the entries by score, and
/// renders the best entry that can be loaded from the knowledge data table.
/// </summary>
/// <param name="keywords">Keywords to look up in <c>KnowledgeIndexData</c>.</param>
/// <returns>The rendered result, or an empty string when nothing was found.</returns>
private static string GetKnownledgeResult(string[] keywords)
{
    // Number of knowledge entries rendered (previously the 'cou' counter).
    const int maxResults = 1;

    // 1. Collect the entries of every keyword together with their scores.
    IntTable<long, double> docsTable = new IntTable<long, double>();
    foreach (var keyword in keywords)
    {
        object obj = KnowledgeIndexData[keyword];
        if (obj == null)
        {
            continue;
        }

        List<Slots<long, double>> dataIndexs = (List<Slots<long, double>>)obj;
        foreach (var slotse in dataIndexs)
        {
            docsTable.Add(slotse.Slot0, slotse.Slot1);
        }
    }

    // 2. Rank: walk the five highest score values and collect every entry whose
    //    score equals one of them (within a small tolerance).
    List<long> sortedList = new List<long>();
    ArrayList list = new ArrayList(docsTable.Values);
    list.Sort();
    list.Reverse();
    for (int rank = 0; rank < list.Count && rank < 5; rank++)
    {
        double svalue = (double)list[rank];
        IDictionaryEnumerator ide = docsTable.GetEnumerator();
        while (ide.MoveNext())
        {
            if (Math.Abs((double)ide.Value - svalue) < 0.00000001)
            {
                sortedList.Add((long)ide.Key);
            }
        }
    }

    // 3. Load entries in rank order and stop once enough were found. The
    //    original kept calling Find for every remaining id after the limit
    //    was reached.
    List<KnowlegeIndex.KnowledgeEntity> doclist = new List<KnowlegeIndex.KnowledgeEntity>();
    HashSet<long> filter = new HashSet<long>();
    foreach (long url in sortedList)
    {
        if (doclist.Count >= maxResults)
        {
            break;
        }

        KnowlegeIndex.KnowledgeEntity entity = _knowledgeDataTable.Find(url);
        if (entity != null && filter.Add(entity.Id))
        {
            doclist.Add(entity);
        }
    }

    // 4. Render the selected entries.
    string result = "";
    foreach (var page in doclist)
    {
        result += BuildTextResult(ChangeColor(keywords, "[知]" + page.QuestionDesc), "答案:" + page.Answer, page.RefUrl, page.EffectTime, "");
    }

    return result;
}
/// <summary>
/// Builds entities for the given times, places, persons and events, finds the
/// relations between every pair of those groups for the requested query type,
/// and returns the values of the related entities, most frequent first.
/// </summary>
/// <param name="forQueryType">Sentence part the query asks for; forwarded to <c>FindRelation</c>.</param>
/// <param name="times">Time expressions.</param>
/// <param name="places">Place expressions.</param>
/// <param name="whoms">Person expressions.</param>
/// <param name="events">Event expressions.</param>
/// <returns>String values of the related entities, ordered by descending occurrence count.</returns>
public List<string> Query(Common.Sentence forQueryType, string[] times, string[] places, string[] whoms, string[] events)
{
    // Entity groups in the fixed order: time, place, person, event.
    Entity[][] groups = new Entity[][]
    {
        BuildEntity(Common.Sentence.Time, times),
        BuildEntity(Common.Sentence.Place, places),
        BuildEntity(Common.Sentence.Whom, whoms),
        BuildEntity(Common.Sentence.Event, events)
    };

    // Relate every pair of groups once: time-place, time-person, time-event,
    // place-person, place-event, person-event (same order as the original code).
    IntTable<Entity, int> result = new IntTable<Entity, int>();
    for (int first = 0; first < groups.Length; first++)
    {
        for (int second = first + 1; second < groups.Length; second++)
        {
            List<Entity> related = FindRelation(groups[first], groups[second], forQueryType);
            if (related != null && related.Count > 0)
            {
                result.Add(related.ToArray());
            }
        }
    }

    // Order by occurrence count, highest first. The original comment promised
    // this ordering, but the table was enumerated unsorted.
    List<DictionaryEntry> ranked = new List<DictionaryEntry>();
    foreach (DictionaryEntry de in result)
    {
        ranked.Add(de);
    }

    ranked.Sort((a, b) => System.Collections.Comparer.Default.Compare(b.Value, a.Value));

    List<string> keys = new List<string>();
    foreach (DictionaryEntry de in ranked)
    {
        keys.Add(((Entity)de.Key).Value.ToString());
    }

    return keys;
}
/// <summary>
/// Builds the knowledge search result for the given keywords: accumulates
/// per-entry scores from the knowledge index, ranks the entries by score, and
/// renders the best entry that can be loaded from the knowledge data table.
/// </summary>
/// <param name="keywords">Keywords to look up in <c>KnowledgeIndexData</c>.</param>
/// <returns>The rendered result, or an empty string when nothing was found.</returns>
private static string GetKnownledgeResult(string[] keywords)
{
    // Number of knowledge entries rendered (previously the 'cou' counter).
    const int maxResults = 1;

    // 1. Collect the entries of every keyword together with their scores.
    IntTable<long, double> docsTable = new IntTable<long, double>();
    foreach (var keyword in keywords)
    {
        object obj = KnowledgeIndexData[keyword];
        if (obj == null)
        {
            continue;
        }

        List<Iveely.STSdb4.Data.Slots<long, double>> dataIndexs = (List<Iveely.STSdb4.Data.Slots<long, double>>)obj;
        foreach (var slotse in dataIndexs)
        {
            docsTable.Add(slotse.Slot0, slotse.Slot1);
        }
    }

    // 2. Rank: walk the five highest score values and collect every entry whose
    //    score equals one of them (within a small tolerance).
    List<long> sortedList = new List<long>();
    ArrayList list = new ArrayList(docsTable.Values);
    list.Sort();
    list.Reverse();
    for (int rank = 0; rank < list.Count && rank < 5; rank++)
    {
        double svalue = (double)list[rank];
        IDictionaryEnumerator ide = docsTable.GetEnumerator();
        while (ide.MoveNext())
        {
            if (Math.Abs((double)ide.Value - svalue) < 0.00000001)
            {
                sortedList.Add((long)ide.Key);
            }
        }
    }

    // 3. Load entries in rank order and stop once enough were found. The
    //    original kept calling Find for every remaining id after the limit
    //    was reached.
    List<KnowlegeIndex.KnowledgeEntity> doclist = new List<KnowlegeIndex.KnowledgeEntity>();
    HashSet<long> filter = new HashSet<long>();
    foreach (long url in sortedList)
    {
        if (doclist.Count >= maxResults)
        {
            break;
        }

        KnowlegeIndex.KnowledgeEntity entity = KnowledgeDataTable.Find(url);
        if (entity != null && filter.Add(entity.Id))
        {
            doclist.Add(entity);
        }
    }

    // 4. Render the selected entries.
    string result = "";
    foreach (var page in doclist)
    {
        result += BuildTextResult(ChangeColor(keywords, "[知]" + page.QuestionDesc), "答案:" + page.Answer, page.RefUrl, page.EffectTime, "");
    }

    return result;
}
/// <summary>
/// Builds the text search result for the given keywords: accumulates per-document
/// scores from the text index, picks the documents matching the highest scores,
/// loads their pages and concatenates one rendered snippet per page.
/// </summary>
/// <param name="keywords">Keywords to look up in <c>TextIndexData</c>.</param>
/// <returns>
/// The concatenated snippets, or the text of the exception when anything fails.
/// </returns>
private static string GetTextResult(string[] keywords)
{
    try
    {
        // 1. Collect the documents of every keyword together with their scores.
        IntTable<string, double> scores = new IntTable<string, double>();
        foreach (string keyword in keywords)
        {
            object raw = TextIndexData[keyword];
            if (raw == null)
            {
                continue;
            }

            List<Iveely.STSdb4.Data.Slots<string, double>> postings = (List<Iveely.STSdb4.Data.Slots<string, double>>)raw;
            foreach (var posting in postings)
            {
                scores.Add(posting.Slot0, posting.Slot1);
            }
        }

        // 2. Rank: walk the nine highest score values and collect every document
        //    whose score equals one of them (within a small tolerance).
        ArrayList orderedScores = new ArrayList(scores.Values);
        orderedScores.Sort();
        orderedScores.Reverse();

        List<string> rankedUrls = new List<string>();
        for (int rank = 0; rank < orderedScores.Count && rank < 9; rank++)
        {
            double wanted = (double)orderedScores[rank];
            IDictionaryEnumerator cursor = scores.GetEnumerator();
            while (cursor.MoveNext())
            {
                double difference = Math.Abs((double)cursor.Value - wanted);
                if (difference < 0.00000001)
                {
                    rankedUrls.Add(cursor.Key.ToString());
                }
            }
        }

        // 3. Load the pages, keeping only the first occurrence of each URL.
        List<Crawler.Page> pages = new List<Crawler.Page>();
        HashSet<string> seenUrls = new HashSet<string>();
        foreach (string url in rankedUrls)
        {
            Crawler.Page candidate = SearchDataTable.Find(url);
            if (candidate != null && seenUrls.Add(candidate.Url))
            {
                pages.Add(candidate);
            }
        }

        // 4. Render a snippet for every page and return them concatenated.
        string output = "";
        foreach (var page in pages)
        {
            output += BuildTextResult(ChangeColor(keywords, page.Title), CheckContent(page.Content, keywords), page.Url, page.Timestamp, "");
        }

        return output;
    }
    catch (Exception exception)
    {
        return exception.ToString();
    }
}