Beispiel #1
0
        /// <summary>
        /// Loads any index that is not yet in memory from disk, then creates a
        /// <c>Server</c> on the local host name and calls <c>Listen</c> on it.
        /// </summary>
        /// <remarks>
        /// The port is <c>_searchPort</c> advanced by the root folder value modulo 100;
        /// <c>_searchPort</c> itself is modified. Failures while computing the port or
        /// starting the server are written to the console and do not leave this method.
        /// </remarks>
        public void StartSearcher()
        {
            if (File.Exists(_textIndexFile) && TextFragment == null)
            {
                TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
            }
            if (File.Exists(_relativeIndexFile) && RelativeTable == null)
            {
                RelativeTable =
                    Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
            }

            try
            {
                // Parsed inside the try block: a root folder that is not a valid integer
                // would otherwise throw past the handler below.
                int servicePort = int.Parse(GetRootFolder()) % 100;

                _searchPort += servicePort;
                Server server = new Server(Dns.GetHostName(), _searchPort, ProcessQuery);
                WriteToConsole(_searchPort + " is start search service.");
                server.Listen();
            }
            catch (Exception exception)
            {
                WriteToConsole("Start Server Error." + exception);
            }
        }
Beispiel #2
0
 /// <summary>
 /// Creates the manager with an empty entity index and, when <c>eeFragment</c> has not
 /// been set yet, creates <c>eeFragment</c>, <c>arFragment</c> and <c>brFragment</c>.
 /// </summary>
 public EntityManager()
 {
     _entityIndex = new Hashtable();

     // eeFragment alone decides whether all three fragments are created.
     if (eeFragment != null)
     {
         return;
     }

     eeFragment = new InvertFragment();
     arFragment = new InvertFragment();
     brFragment = new InvertFragment();
 }
Beispiel #3
0
            /// <summary>
            /// Looks up documents common to the key pairs (a, b), (b, c) and (c, a), in that
            /// order, and returns the first non-empty result.
            /// </summary>
            /// <param name="fragment">Fragment that is queried.</param>
            /// <param name="a">First key.</param>
            /// <param name="b">Second key.</param>
            /// <param name="c">Third key.</param>
            /// <returns>
            /// The first non-empty lookup result; when every lookup is null or empty, the
            /// result of the last lookup (c, a) is returned unchanged.
            /// </returns>
            private List<string> GetRelationList(InvertFragment fragment, string a, string b, string c)
            {
                string[][] keyPairs =
                {
                    new string[] { a, b },
                    new string[] { b, c },
                    new string[] { c, a }
                };

                List<string> matches = null;
                foreach (string[] pair in keyPairs)
                {
                    // 10 is passed through as in every lookup; presumably a result limit - confirm.
                    matches = fragment.FindCommonDocumentByKeys(pair, 10);
                    if (matches != null && matches.Count > 0)
                    {
                        break;
                    }
                }

                return matches;
            }
Beispiel #4
0
        /// <summary>
        /// Main program entry point: initializes the stores and indexes, starts the
        /// searcher on its own thread, then crawls and indexes while URLs remain.
        /// </summary>
        /// <param name="args">Arguments passed unchanged to <c>Init</c>.</param>
        public override void Run(object[] args)
        {
            // 1. Initialization
            Init(args);
            DataStore = new LocalStore<Template.Question>(
                GetRootFolder() + "\\QuestionData.index",
                GetRootFolder() + "\\QuestionData",
                100);

            // 1.1 Text index file: load it when present, otherwise fall back to a new fragment
            _textIndexFile = GetRootFolder() + "\\InvertFragment.part";
            if (File.Exists(_textIndexFile))
            {
                TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
            }
            if (TextFragment == null)
            {
                TextFragment = new InvertFragment(GetRootFolder());
            }

            // 1.2 Relative index file: same load-or-create handling
            _relativeIndexFile = GetRootFolder() + "\\RelativeFragment.part";
            if (File.Exists(_relativeIndexFile))
            {
                RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
            }
            if (RelativeTable == null)
            {
                RelativeTable = new DimensionTable<string, string, double>();
            }

            // Seed URL for the crawl
            Urls.Add("http://www.baike.com");

            // The searcher runs on its own thread rather than being called inline
            Thread searcher = new Thread(StartSearcher);
            searcher.Start();

            // Wait 10 seconds before crawling starts
            Thread.Sleep(10000);

            // 2. Data collection loop
            while (Urls.Count > 0)
            {
                var batch = new List<Page>();

                // 2.1 Run the crawler
                try
                {
                    Crawler(ref batch);
                }
                catch (Exception crawlError)
                {
                    Logger.Warn(crawlError);
                }

                // 2.2 Run the indexer, only when the crawler produced pages
                bool hasPages = batch != null && batch.Count > 0;
                if (hasPages)
                {
                    try
                    {
                        Indexer(ref batch);
                    }
                    catch (Exception indexError)
                    {
                        Logger.Warn(indexError);
                    }
                }

                // 2.3 Update the URL list
                try
                {
                    Urls.AddRange(GetKeysByValueFromCache(false, 10, true));
                }
                catch (Exception refreshError)
                {
                    Logger.Warn(refreshError);
                }
            }
        }
Beispiel #5
0
 /// <summary>
 /// Loads any index that is not yet in memory from disk, then creates a
 /// <c>Server</c> on the local host name and calls <c>Listen</c> on it.
 /// </summary>
 /// <remarks>
 /// The port is <c>_searchPort</c> advanced by the root folder value modulo 100;
 /// <c>_searchPort</c> itself is modified. Failures while computing the port or
 /// starting the server are written to the console and do not leave this method.
 /// </remarks>
 public void StartSearcher()
 {
     if (File.Exists(_textIndexFile) && TextFragment == null)
     {
         TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
     }
     if (File.Exists(_relativeIndexFile) && RelativeTable == null)
     {
         RelativeTable =
             Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
     }
     try
     {
         // Parsed inside the try block: a root folder that is not a valid integer
         // would otherwise throw past the handler below.
         int servicePort = int.Parse(GetRootFolder()) % 100;

         _searchPort += servicePort;
         Server server = new Server(Dns.GetHostName(), _searchPort, ProcessQuery);
         WriteToConsole(_searchPort + " is start search service.");
         server.Listen();
     }
     catch (Exception exception)
     {
         WriteToConsole("Start Server Error." + exception);
     }
 }
Beispiel #6
0
        /// <summary>
        /// Indexer entry point: splits each page into sentences, builds a question from
        /// every sentence of at least 5 characters, then indexes, stores and persists the results.
        /// </summary>
        /// <param name="pages">
        /// Collection of web page information; it is cleared once its sentences have been processed.
        /// </param>
        public void Indexer(ref List<Page> pages)
        {
            // Step 1: analyze what the pages express, sentence by sentence.
            const string delimiter = ".?。!\t?…●|\r\n])!";
            char[] separators = delimiter.ToCharArray();
            var collected = new List<Template.Question>();

            foreach (Page page in pages)
            {
                foreach (string sentence in page.Content.Split(separators, StringSplitOptions.RemoveEmptyEntries))
                {
                    // Fragments shorter than 5 characters are skipped.
                    if (sentence.Length < 5)
                    {
                        continue;
                    }

                    Template.Question candidate = Bot.GetInstance(GetRootFolder())
                        .BuildQuestion(sentence, page.Url, page.Title);

                    // Only questions that carry at least one description are kept.
                    if (candidate == null || candidate.Description == null || candidate.Description.Count <= 0)
                    {
                        continue;
                    }

                    candidate.Content = sentence;
                    collected.Add(candidate);
                }
            }
            pages.Clear();

            // Step 2: build the indexes for the extracted questions.
            // Load whichever index is still missing in memory and exists on disk.
            if (TextFragment == null && File.Exists(_textIndexFile))
            {
                TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
            }

            if (RelativeTable == null && File.Exists(_relativeIndexFile))
            {
                RelativeTable =
                    Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
            }

            foreach (Template.Question question in collected)
            {
                // The question's hash code doubles as its document id.
                int documentId = question.GetHashCode();
                question.Id = documentId;
                TextFragment.AddDocument(documentId, question.Content, false);

                foreach (var entity in question.Entity)
                {
                    // NOTE(review): the cell is compared against null; if the table yields a
                    // plain double this branch can never be taken - confirm against DimensionTable.
                    double previous;
                    if (RelativeTable[entity.Item1][entity.Item2] == null)
                    {
                        previous = 0;
                    }
                    else
                    {
                        previous = RelativeTable[entity.Item1][entity.Item2];
                    }
                    RelativeTable[entity.Item1][entity.Item2] = previous + 0.0001;
                }

                DataStore.Write(question);
            }
            collected.Clear();

            // Both indexes are written back on every call, even when nothing was added.
            Serializer.SerializeToFile(TextFragment, _textIndexFile);
            Serializer.SerializeToFile(RelativeTable, _relativeIndexFile);
        }
            /// <summary>
            /// Queries the fragment for documents common to (a, b), then (b, c), then (c, a),
            /// stopping at the first lookup that yields a non-empty list.
            /// </summary>
            /// <param name="fragment">Fragment that is queried.</param>
            /// <param name="a">First key.</param>
            /// <param name="b">Second key.</param>
            /// <param name="c">Third key.</param>
            /// <returns>
            /// The first non-empty result, or whatever the final (c, a) lookup returned when
            /// all three were null or empty.
            /// </returns>
            private List<string> GetRelationList(InvertFragment fragment, string a, string b, string c)
            {
                List<string> found = fragment.FindCommonDocumentByKeys(new string[] { a, b }, 10);

                // Fall through to the next pair only while nothing has been found.
                if (found == null || found.Count <= 0)
                {
                    found = fragment.FindCommonDocumentByKeys(new string[] { b, c }, 10);
                }

                if (found == null || found.Count <= 0)
                {
                    found = fragment.FindCommonDocumentByKeys(new string[] { c, a }, 10);
                }

                return found;
            }
 /// <summary>
 /// Creates <c>eeFragment</c>, <c>arFragment</c> and <c>brFragment</c> when
 /// <c>eeFragment</c> is still null, and gives this instance a new, empty entity index.
 /// </summary>
 public EntityManager()
 {
     // Only eeFragment is checked; the other two fragments are created together with it.
     bool fragmentsMissing = eeFragment == null;
     if (fragmentsMissing)
     {
         eeFragment = new InvertFragment();
         arFragment = new InvertFragment();
         brFragment = new InvertFragment();
     }

     _entityIndex = new Hashtable();
 }
Beispiel #9
0
        /// <summary>
        /// Indexer entry point. Extracts questions from the sentences of the given pages,
        /// adds them to the text index and the relative table, writes them to the data
        /// store and serializes both indexes to their files.
        /// </summary>
        /// <param name="pages">Collection of web page information; cleared after extraction.</param>
        public void Indexer(ref List<Page> pages)
        {
            // Phase 1: automatically analyze the meaning expressed by the pages.
            const string delimiter = ".?。!\t?…●|\r\n])!";
            List<Template.Question> extracted = new List<Template.Question>();

            foreach (Page current in pages)
            {
                string[] parts = current.Content.Split(
                    delimiter.ToCharArray(),
                    StringSplitOptions.RemoveEmptyEntries);

                foreach (string part in parts)
                {
                    // Sentences shorter than 5 characters are not analyzed.
                    if (part.Length < 5)
                    {
                        continue;
                    }

                    Bot bot = Bot.GetInstance(GetRootFolder());
                    Template.Question built = bot.BuildQuestion(part, current.Url, current.Title);

                    // Keep only questions that have at least one description entry.
                    bool usable = built != null && built.Description != null && built.Description.Count > 0;
                    if (usable)
                    {
                        built.Content = part;
                        extracted.Add(built);
                    }
                }
            }
            pages.Clear();

            // Phase 2: index the extracted semantics.
            // Indexes that are not in memory yet are read from disk when their file exists.
            if (File.Exists(_textIndexFile) && TextFragment == null)
            {
                TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
            }

            if (File.Exists(_relativeIndexFile) && RelativeTable == null)
            {
                RelativeTable =
                    Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
            }

            foreach (Template.Question item in extracted)
            {
                // The hash code is used as the document id and stored on the question.
                int hash = item.GetHashCode();
                item.Id = hash;
                TextFragment.AddDocument(hash, item.Content, false);

                foreach (var pair in item.Entity)
                {
                    // NOTE(review): null check on the table cell kept as written; for a plain
                    // double it is always false - verify against DimensionTable's indexer.
                    double current = RelativeTable[pair.Item1][pair.Item2] == null
                        ? 0
                        : RelativeTable[pair.Item1][pair.Item2];

                    // Each occurrence of the entity pair raises its weight by 0.0001.
                    RelativeTable[pair.Item1][pair.Item2] = current + 0.0001;
                }

                DataStore.Write(item);
            }
            extracted.Clear();

            // Persist both indexes, regardless of whether any question was added.
            Serializer.SerializeToFile(TextFragment, _textIndexFile);
            Serializer.SerializeToFile(RelativeTable, _relativeIndexFile);
        }
Beispiel #10
0
        /// <summary>
        /// Main program entry point. Sets up the data store and both indexes, launches
        /// <c>StartSearcher</c> on a separate thread, and then repeats crawl, index and
        /// URL refresh for as long as <c>Urls</c> is not empty.
        /// </summary>
        /// <param name="args">Arguments handed to <c>Init</c>.</param>
        public override void Run(object[] args)
        {
            // 1. Initialization
            Init(args);
            DataStore = new LocalStore<Template.Question>(
                GetRootFolder() + "\\QuestionData.index",
                GetRootFolder() + "\\QuestionData",
                100);

            // 1.1 Text index file
            _textIndexFile = GetRootFolder() + "\\InvertFragment.part";
            if (File.Exists(_textIndexFile))
            {
                TextFragment = Serializer.DeserializeFromFile<InvertFragment>(_textIndexFile);
            }
            // No usable index loaded: start from a new fragment.
            if (TextFragment == null)
            {
                TextFragment = new InvertFragment(GetRootFolder());
            }

            // 1.2 Relative index file
            _relativeIndexFile = GetRootFolder() + "\\RelativeFragment.part";
            if (File.Exists(_relativeIndexFile))
            {
                RelativeTable = Serializer.DeserializeFromFile<DimensionTable<string, string, double>>(_relativeIndexFile);
            }
            // No usable table loaded: start from an empty one.
            if (RelativeTable == null)
            {
                RelativeTable = new DimensionTable<string, string, double>();
            }

            Urls.Add("http://www.baike.com");

            // Start the searcher on a dedicated thread, then pause 10 seconds before crawling.
            Thread worker = new Thread(StartSearcher);
            worker.Start();
            Thread.Sleep(10000);

            // 2. Data collection loop
            while (Urls.Count > 0)
            {
                List<Page> fetched = new List<Page>();

                // 2.1 Crawler run; failures are logged and the loop continues
                try
                {
                    Crawler(ref fetched);
                }
                catch (Exception error)
                {
                    Logger.Warn(error);
                }

                // 2.2 Indexer run, skipped when the crawler returned nothing
                if (fetched != null && fetched.Count > 0)
                {
                    try
                    {
                        Indexer(ref fetched);
                    }
                    catch (Exception error)
                    {
                        Logger.Warn(error);
                    }
                }

                // 2.3 URL update from the cache
                try
                {
                    var nextUrls = GetKeysByValueFromCache(false, 10, true);
                    Urls.AddRange(nextUrls);
                }
                catch (Exception error)
                {
                    Logger.Warn(error);
                }
            }
        }