/// <summary>
/// Starts the search service: reloads the persisted indexes when they are not yet
/// in memory, offsets <c>_searchPort</c> by a value derived from the root folder
/// name, then creates the server and calls <c>Listen()</c> on it.
/// </summary>
/// <remarks>
/// Any failure while computing the port or starting the server is written to the
/// console with the "Start Server Error." prefix rather than thrown to the caller.
/// </remarks>
public void StartSearcher()
{
    // Reload each index only when its file exists and nothing is loaded yet.
    if (File.Exists(_textIndexFile) && TextFragment == null)
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (File.Exists(_relativeIndexFile) && RelativeTable == null)
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    try
    {
        // The parse lives inside the try block: this method is used as a thread
        // entry point, and a root folder that is not a valid integer would
        // otherwise throw out of the thread instead of being reported below.
        int servicePort = int.Parse(GetRootFolder()) % 100;

        _searchPort += servicePort;
        Server server = new Server(Dns.GetHostName(), _searchPort, ProcessQuery);
        WriteToConsole(_searchPort + " is start search service.");
        server.Listen();
    }
    catch (Exception exception)
    {
        WriteToConsole("Start Server Error." + exception);
    }
}
/// <summary>
/// Creates an entity manager with an empty entity index. When <c>eeFragment</c>
/// is still null, <c>eeFragment</c>, <c>arFragment</c> and <c>brFragment</c> are
/// each assigned a new <c>InvertFragment</c>.
/// </summary>
public EntityManager()
{
    _entityIndex = new Hashtable();

    // eeFragment is the only fragment checked; the three are created together.
    if (eeFragment != null)
    {
        return;
    }

    eeFragment = new InvertFragment();
    arFragment = new InvertFragment();
    brFragment = new InvertFragment();
}
/// <summary>
/// Looks up documents shared by a pair of keys, trying the pairs (a, b), (b, c)
/// and (c, a) in that order and stopping at the first pair that yields a
/// non-empty result.
/// </summary>
/// <param name="fragment">Index queried through <c>FindCommonDocumentByKeys</c>.</param>
/// <param name="a">First key.</param>
/// <param name="b">Second key.</param>
/// <param name="c">Third key.</param>
/// <returns>
/// The first non-empty result; when every pair comes back null or empty, the
/// result of the last lookup (c, a) is returned as-is.
/// </returns>
private List<string> GetRelationList(InvertFragment fragment, string a, string b, string c)
{
    string[][] keyPairs =
    {
        new string[] { a, b },
        new string[] { b, c },
        new string[] { c, a }
    };

    List<string> matches = null;
    foreach (string[] keyPair in keyPairs)
    {
        // 10 is passed through unchanged as the second argument of the lookup.
        matches = fragment.FindCommonDocumentByKeys(keyPair, 10);
        if (matches != null && matches.Count > 0)
        {
            break;
        }
    }

    return matches;
}
/// <summary>
/// Main program entry point: initialises the data store and both indexes, starts
/// the searcher on its own thread, then repeatedly crawls, indexes and refreshes
/// the URL list for as long as <c>Urls</c> is non-empty.
/// </summary>
/// <param name="args">Start-up arguments, forwarded unchanged to <c>Init</c>.</param>
public override void Run(object[] args)
{
    // Pause between starting the searcher thread and entering the crawl loop.
    const int searcherStartDelayMs = 10000;

    // 1. Initialisation
    Init(args);
    DataStore = new LocalStore<Template.Question>(
        GetRootFolder() + "\\QuestionData.index",
        GetRootFolder() + "\\QuestionData",
        100);

    // 1.1 Text index file: load it when present; create a fresh index if
    //     nothing is loaded afterwards.
    _textIndexFile = GetRootFolder() + "\\InvertFragment.part";
    if (File.Exists(_textIndexFile))
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (TextFragment == null)
    {
        TextFragment = new InvertFragment(GetRootFolder());
    }

    // 1.2 Relation index file: same load-or-create sequence.
    _relativeIndexFile = GetRootFolder() + "\\RelativeFragment.part";
    if (File.Exists(_relativeIndexFile))
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    if (RelativeTable == null)
    {
        RelativeTable = new DimensionTable<string, string, double>();
    }

    Urls.Add("http://www.baike.com");

    // The searcher runs on a separate thread instead of being called inline.
    Thread searcher = new Thread(StartSearcher);
    searcher.Start();
    Thread.Sleep(searcherStartDelayMs);

    // 2. Data collection loop
    while (Urls.Count > 0)
    {
        List<Page> batch = new List<Page>();

        // 2.1 Run the crawler
        try
        {
            Crawler(ref batch);
        }
        catch (Exception crawlError)
        {
            Logger.Warn(crawlError);
        }

        // 2.2 Run the indexer, only when the crawler produced pages
        bool hasPages = batch != null && batch.Count > 0;
        if (hasPages)
        {
            try
            {
                Indexer(ref batch);
            }
            catch (Exception indexError)
            {
                Logger.Warn(indexError);
            }
        }

        // 2.3 Refresh the URL list
        try
        {
            Urls.AddRange(GetKeysByValueFromCache(false, 10, true));
        }
        catch (Exception refreshError)
        {
            Logger.Warn(refreshError);
        }
    }
}
/// <summary>
/// Starts the search service: reloads the persisted indexes when they are not yet
/// in memory, offsets <c>_searchPort</c> by a value derived from the root folder
/// name, then creates the server and calls <c>Listen()</c> on it.
/// </summary>
/// <remarks>
/// Any failure while computing the port or starting the server is written to the
/// console with the "Start Server Error." prefix rather than thrown to the caller.
/// </remarks>
public void StartSearcher()
{
    // Reload each index only when its file exists and nothing is loaded yet.
    if (File.Exists(_textIndexFile) && TextFragment == null)
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (File.Exists(_relativeIndexFile) && RelativeTable == null)
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    try
    {
        // The parse lives inside the try block: this method is used as a thread
        // entry point, and a root folder that is not a valid integer would
        // otherwise throw out of the thread instead of being reported below.
        int servicePort = int.Parse(GetRootFolder()) % 100;

        _searchPort += servicePort;
        Server server = new Server(Dns.GetHostName(), _searchPort, ProcessQuery);
        WriteToConsole(_searchPort + " is start search service.");
        server.Listen();
    }
    catch (Exception exception)
    {
        WriteToConsole("Start Server Error." + exception);
    }
}
/// <summary>
/// Indexer entry point: splits each page's content into sentences, builds a
/// question from every sentence of at least five characters, then records the
/// accepted questions in the text index, the relation table and the data store.
/// Both indexes are serialized to their files before the method returns.
/// </summary>
/// <param name="pages">
/// Collection of page information; it is cleared once its sentences have been
/// extracted.
/// </param>
public void Indexer(ref List<Page> pages)
{
    // Automatically analyse the meaning expressed by the pages.
    const string delimiter = ".?。!\t?…●|\r\n])!";
    char[] separators = delimiter.ToCharArray();
    List<Template.Question> parsed = new List<Template.Question>();

    foreach (Page page in pages)
    {
        foreach (string sentence in page.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
        {
            // Sentences shorter than five characters are skipped.
            if (sentence.Length < 5)
            {
                continue;
            }

            Template.Question candidate = Bot.GetInstance(GetRootFolder())
                .BuildQuestion(sentence, page.Url, page.Title);

            // Keep only results that carry a non-empty description.
            if (candidate == null || candidate.Description == null || candidate.Description.Count <= 0)
            {
                continue;
            }

            candidate.Content = sentence;
            parsed.Add(candidate);
        }
    }

    pages.Clear();

    // Build the index for the extracted semantics. Each persisted index is
    // reloaded only when its file exists and nothing is loaded in memory.
    if (File.Exists(_textIndexFile) && TextFragment == null)
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (File.Exists(_relativeIndexFile) && RelativeTable == null)
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    foreach (Template.Question question in parsed)
    {
        // The question's hash code doubles as its document id.
        int documentId = question.GetHashCode();
        question.Id = documentId;
        TextFragment.AddDocument(documentId, question.Content, false);

        // Raise the stored value of every entity pair by a fixed 0.0001.
        foreach (var entity in question.Entity)
        {
            double previous = RelativeTable[entity.Item1][entity.Item2] == null ? 0 : RelativeTable[entity.Item1][entity.Item2];
            RelativeTable[entity.Item1][entity.Item2] = previous + 0.0001;
        }

        DataStore.Write(question);
    }

    parsed.Clear();
    Serializer.SerializeToFile(TextFragment, _textIndexFile);
    Serializer.SerializeToFile(RelativeTable, _relativeIndexFile);
}
/// <summary>
/// Looks up documents shared by a pair of keys, trying the pairs (a, b), (b, c)
/// and (c, a) in that order and stopping at the first pair that yields a
/// non-empty result.
/// </summary>
/// <param name="fragment">Index queried through <c>FindCommonDocumentByKeys</c>.</param>
/// <param name="a">First key.</param>
/// <param name="b">Second key.</param>
/// <param name="c">Third key.</param>
/// <returns>
/// The first non-empty result; when every pair comes back null or empty, the
/// result of the last lookup (c, a) is returned as-is.
/// </returns>
private List<string> GetRelationList(InvertFragment fragment, string a, string b, string c)
{
    string[][] keyPairs =
    {
        new string[] { a, b },
        new string[] { b, c },
        new string[] { c, a }
    };

    List<string> matches = null;
    foreach (string[] keyPair in keyPairs)
    {
        // 10 is passed through unchanged as the second argument of the lookup.
        matches = fragment.FindCommonDocumentByKeys(keyPair, 10);
        if (matches != null && matches.Count > 0)
        {
            break;
        }
    }

    return matches;
}
/// <summary>
/// Indexer entry point: splits each page's content into sentences, builds a
/// question from every sentence of at least five characters, then records the
/// accepted questions in the text index, the relation table and the data store.
/// Both indexes are serialized to their files before the method returns.
/// </summary>
/// <param name="pages">
/// Collection of page information; it is cleared once its sentences have been
/// extracted.
/// </param>
public void Indexer(ref List<Page> pages)
{
    // Automatically analyse the meaning expressed by the pages.
    const string delimiter = ".?。!\t?…●|\r\n])!";
    char[] separators = delimiter.ToCharArray();
    List<Template.Question> parsed = new List<Template.Question>();

    foreach (Page page in pages)
    {
        foreach (string sentence in page.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
        {
            // Sentences shorter than five characters are skipped.
            if (sentence.Length < 5)
            {
                continue;
            }

            Template.Question candidate = Bot.GetInstance(GetRootFolder())
                .BuildQuestion(sentence, page.Url, page.Title);

            // Keep only results that carry a non-empty description.
            if (candidate == null || candidate.Description == null || candidate.Description.Count <= 0)
            {
                continue;
            }

            candidate.Content = sentence;
            parsed.Add(candidate);
        }
    }

    pages.Clear();

    // Build the index for the extracted semantics. Each persisted index is
    // reloaded only when its file exists and nothing is loaded in memory.
    if (File.Exists(_textIndexFile) && TextFragment == null)
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (File.Exists(_relativeIndexFile) && RelativeTable == null)
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    foreach (Template.Question question in parsed)
    {
        // The question's hash code doubles as its document id.
        int documentId = question.GetHashCode();
        question.Id = documentId;
        TextFragment.AddDocument(documentId, question.Content, false);

        // Raise the stored value of every entity pair by a fixed 0.0001.
        foreach (var entity in question.Entity)
        {
            double previous = RelativeTable[entity.Item1][entity.Item2] == null ? 0 : RelativeTable[entity.Item1][entity.Item2];
            RelativeTable[entity.Item1][entity.Item2] = previous + 0.0001;
        }

        DataStore.Write(question);
    }

    parsed.Clear();
    Serializer.SerializeToFile(TextFragment, _textIndexFile);
    Serializer.SerializeToFile(RelativeTable, _relativeIndexFile);
}
/// <summary>
/// Main program entry point: initialises the data store and both indexes, starts
/// the searcher on its own thread, then repeatedly crawls, indexes and refreshes
/// the URL list for as long as <c>Urls</c> is non-empty.
/// </summary>
/// <param name="args">Start-up arguments, forwarded unchanged to <c>Init</c>.</param>
public override void Run(object[] args)
{
    // Pause between starting the searcher thread and entering the crawl loop.
    const int searcherStartDelayMs = 10000;

    // 1. Initialisation
    Init(args);
    DataStore = new LocalStore<Template.Question>(
        GetRootFolder() + "\\QuestionData.index",
        GetRootFolder() + "\\QuestionData",
        100);

    // 1.1 Text index file: load it when present; create a fresh index if
    //     nothing is loaded afterwards.
    _textIndexFile = GetRootFolder() + "\\InvertFragment.part";
    if (File.Exists(_textIndexFile))
    {
        TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
    }

    if (TextFragment == null)
    {
        TextFragment = new InvertFragment(GetRootFolder());
    }

    // 1.2 Relation index file: same load-or-create sequence.
    _relativeIndexFile = GetRootFolder() + "\\RelativeFragment.part";
    if (File.Exists(_relativeIndexFile))
    {
        RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
    }

    if (RelativeTable == null)
    {
        RelativeTable = new DimensionTable<string, string, double>();
    }

    Urls.Add("http://www.baike.com");

    // The searcher runs on a separate thread instead of being called inline.
    Thread searcher = new Thread(StartSearcher);
    searcher.Start();
    Thread.Sleep(searcherStartDelayMs);

    // 2. Data collection loop
    while (Urls.Count > 0)
    {
        List<Page> batch = new List<Page>();

        // 2.1 Run the crawler
        try
        {
            Crawler(ref batch);
        }
        catch (Exception crawlError)
        {
            Logger.Warn(crawlError);
        }

        // 2.2 Run the indexer, only when the crawler produced pages
        bool hasPages = batch != null && batch.Count > 0;
        if (hasPages)
        {
            try
            {
                Indexer(ref batch);
            }
            catch (Exception indexError)
            {
                Logger.Warn(indexError);
            }
        }

        // 2.3 Refresh the URL list
        try
        {
            Urls.AddRange(GetKeysByValueFromCache(false, 10, true));
        }
        catch (Exception refreshError)
        {
            Logger.Warn(refreshError);
        }
    }
}