示例#1
0
 /// <summary>
 /// Reads the document list from the database and distributes it across
 /// background indexing threads, each running <c>Indexate</c> on its own batch.
 /// </summary>
 /// <remarks>
 /// Documents are split into at most <c>ThreadCount</c> batches of (nearly) equal
 /// size. Every document ends up in exactly one batch, including the trailing
 /// partial batch. A non-positive <c>ThreadCount</c> is treated as 1.
 /// Any exception raised while preparing or starting the threads is caught and logged.
 /// </remarks>
 public void Start()
 {
     try
     {
         using (DBDocumentStorage ds = new DBDocumentStorage(_config.DBPath))
         {
             Threads = new Dictionary<Thread, ThreadContext>();
             Dictionary<Guid, string> docs = ds.GetDocumentsList();

             // Ceiling division: guarantees no more than `workers` batches and a
             // batch size of at least 1 even when there are fewer documents than threads.
             int workers = Math.Max(1, ThreadCount);
             int batchSize = Math.Max(1, (docs.Count + workers - 1) / workers);

             Dictionary<Guid, string> batch = new Dictionary<Guid, string>();
             foreach (var doc in docs)
             {
                 // Add first, then test: the current document belongs to the batch being filled.
                 batch.Add(doc.Key, doc.Value);
                 if (batch.Count >= batchSize)
                 {
                     StartIndexingThread(batch);
                     batch = new Dictionary<Guid, string>();
                 }
             }

             // Flush the remainder so the last documents are not silently dropped.
             if (batch.Count > 0)
             {
                 StartIndexingThread(batch);
             }

             Log("Threads started", "", LogType.Message);
         }
     }
     catch (Exception ex)
     {
         Log(string.Format("{0} \r\n{1}", ex.Message, ex.StackTrace), "SpydeeCore", LogType.Error, "CoreConstructor");
     }
 }

 /// <summary>
 /// Creates a background thread that indexes the given batch, starts it and
 /// records it in <c>Threads</c> together with its context.
 /// </summary>
 /// <param name="batch">Document id/value pairs the new thread will process.</param>
 private void StartIndexingThread(Dictionary<Guid, string> batch)
 {
     Thread th = new Thread(Indexate);
     IndexatorThreadContext tc = new IndexatorThreadContext(batch, th, Thread.CurrentThread, this);
     th.IsBackground = true;
     th.Start(tc);
     Threads.Add(th, tc);
 }
示例#2
0
 /// <summary>
 /// Thread entry point: loads every document of the batch carried by the
 /// thread context from the database and reads its text.
 /// </summary>
 /// <param name="context">
 /// An <c>IndexatorThreadContext</c> passed as <c>object</c> (thread-start
 /// parameter). It is disposed when the method finishes.
 /// </param>
 private void Indexate(object context)
 {
     using (IndexatorThreadContext tc = (IndexatorThreadContext)context)
     {
         var kernel = tc.kernel;
         // The storage is disposable (it is used in a using-block elsewhere in this
         // code), so release it when the batch is done instead of leaking it per thread.
         using (DBDocumentStorage dbStorage = new DBDocumentStorage(kernel.DBPath))
         {
             foreach (var dcinfo in tc.DocList)
             {
                 DBDocument d = dbStorage.GetDocumentByID(dcinfo.Key);
                 try
                 {
                     // The text is read but not used yet; kept as in the original
                     // (presumably a placeholder for the actual indexing step).
                     string t = d.Text;
                 }
                 finally
                 {
                     // Documents are disposable; free each one as soon as it has been read.
                     d.Dispose();
                 }
             }
         }
     }
 }
示例#3
0
 /// <summary>
 /// Opens a document storage for the given connection string, keeps it as the
 /// current storage, loads its documents and refreshes the document list.
 /// </summary>
 /// <param name="ConnectionString">Value handed to the <c>DBDocumentStorage</c> constructor.</param>
 private void LoadFromDB(string ConnectionString)
 {
     DBDocumentStorage storage = new DBDocumentStorage(ConnectionString);

     // Publish the storage first, then pull the documents from that same instance.
     DBstorage = storage;
     docs = storage.GetDocuments();

     UpdateDocList();
 }
示例#4
0
        /// <summary>
        /// Thread entry point that crawls one site: optionally stores its robots.txt,
        /// then processes the site's queue of unscanned pages until it is empty or a
        /// stop is requested.
        /// </summary>
        /// <remarks>
        /// For every page the response is downloaded and decoded using the charset
        /// from its Content-Type (falling back to <c>Encoding.Default</c>). For
        /// text media types the HTML is parsed, discovered links are queued with
        /// their query string and fragment removed, and the plain text is matched
        /// against the kernel's search words. Pages with at least one match are
        /// written to the database. Errors on a single page are logged and do not
        /// stop the crawl.
        /// </remarks>
        /// <param name="tc">
        /// A <c>CrawlerThreadContext</c> passed as <c>object</c> (thread-start
        /// parameter). It is disposed when the scan ends.
        /// </param>
        public void SiteScan(object tc)
        {
            using (CrawlerThreadContext mtc = (CrawlerThreadContext)tc)
            {
                CrawlerCore kernel = (CrawlerCore)mtc.kernel;
                DBDocumentStorage dbStorage = new DBDocumentStorage(kernel.DBPath);
                try
                {
                    Log("Соединение с базой. Строка соединения: [" + kernel.DBPath + "] ...", "Core", LogType.Message);
                    string msg = string.Format("Поток ;{0} стартован, сканирование сайта '{1}'", Thread.CurrentThread.ManagedThreadId, mtc.SiteQueue.GetNextUnscanned().Adress);
                    kernel.Log(msg, "Core", LogType.Message);
                    if (kernel.UseRobots)
                    {
                        try
                        {
                            // Leading slash: robots.txt is only meaningful at the root of the
                            // host, regardless of the path the site address points to.
                            Uri robotsUrl = new Uri(mtc.Site.Adress, "/robots.txt");
                            // Dispose the response so the underlying connection is released.
                            using (WebResponse wrr = kernel.MakeRequest(robotsUrl))
                            {
                                string content = ReadResponseText(wrr, Encoding.Default);
                                string hdr = HTMLUtils.GetHeaderAsString(wrr);
                                DBDocument d = dbStorage.CreateDocument(robotsUrl.OriginalString, content, hdr);
                                dbStorage.AddDocument(d);
                            }
                        }
                        catch (Exception e)
                        {
                            string pgmsg = string.Format("Robots.txt ОШИБКА: {0}", e.Message);
                            kernel.Log(pgmsg, "Core", LogType.Error);
                        }
                    }
                    while (mtc.SiteQueue.HasUnscanned && !mtc.Stop)
                    {
                        // Take the next address from the queue.
                        Page pg = mtc.SiteQueue.GetNextUnscanned();
                        try
                        {
                            // Request the page; the response is disposed once it has been processed.
                            using (WebResponse w = kernel.MakeRequest(pg.Adress))
                            {
                                System.Net.Mime.ContentType ct = new System.Net.Mime.ContentType(w.ContentType);
                                Encoding fEncoding = Encoding.Default;
                                if (!string.IsNullOrEmpty(ct.CharSet))
                                    fEncoding = Encoding.GetEncoding(ct.CharSet);
                                string content = ReadResponseText(w, fEncoding);
                                string scanmsg = string.Format("Сканирую страницу: {1}", Thread.CurrentThread.ManagedThreadId, pg.Adress);
                                kernel.Log(scanmsg, "Core", LogType.Message);
                                List<FoundText> ft = null;
                                if (ct.MediaType.StartsWith("text", StringComparison.InvariantCultureIgnoreCase))
                                {
                                    HtmlDocument doc = new HtmlDocument();
                                    doc.LoadHtml(content);
                                    // Collect links to other pages; query string and fragment are dropped.
                                    SiteList links = GetLinksFromHTMLDocument(doc);
                                    foreach (string link in links)
                                    {
                                        string[] parts = link.Split('?');
                                        parts = parts[0].Split('#');
                                        mtc.SiteQueue.Add(parts[0]);
                                    }

                                    // Extract the "clean" text and look for the search words in it.
                                    string plainText = HTMLUtils.GetPlainText(doc);
                                    ft = HTMLUtils.GetMatches(plainText, kernel.SearchWords.ToArray());
                                }
                                if ((ft != null) && (ft.Count > 0))
                                {
                                    string hdr = HTMLUtils.GetHeaderAsString(w);
                                    // Store the page in the database.
                                    DBDocument d = dbStorage.CreateDocument(pg.Adress.OriginalString, content, hdr);
                                    dbStorage.AddDocument(d);
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            string pgmsg = string.Format("Страница: '{0}' Ошибка: {1}", pg.Adress, e.Message);
                            kernel.Log(pgmsg, "Core", LogType.Error);
                        }
                        finally
                        {
                            // Mark the page as scanned even when it failed; otherwise it stays
                            // unscanned and would presumably be handed out again immediately,
                            // retrying the same broken page forever.
                            mtc.SiteQueue[pg] = true;
                        }
                        // Throttle after every request, successful or not, so a run of
                        // failing pages cannot hammer the site without any delay.
                        Thread.Sleep(mtc.kernel.SiteScanDelay);
                    }
                }
                catch (Exception e)
                {
                    kernel.Log(e.Message, "Core", LogType.Error);
                    kernel.Log(e.StackTrace, "Core", LogType.Error);
                }
                finally
                {
                    dbStorage.Dispose();
                }
            }
        }

        /// <summary>
        /// Reads the whole body of a response as text and closes the response stream.
        /// </summary>
        /// <param name="response">Response whose body is read.</param>
        /// <param name="encoding">Encoding used to decode the body.</param>
        /// <returns>The decoded response body.</returns>
        private static string ReadResponseText(WebResponse response, Encoding encoding)
        {
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, encoding))
            {
                return reader.ReadToEnd();
            }
        }
示例#5
0
        /// <summary>
        /// Scans every stored document, keeps those whose plain text matches the
        /// filter words and adds a row for each of them to the grid.
        /// </summary>
        /// <remarks>
        /// Does nothing when the filter is empty. The scan stops early when
        /// <c>StopSearch</c> is set. Only documents whose Content-Type header starts
        /// with "text" (case-insensitive) are examined. A failure on a single
        /// document is logged and the scan continues with the next one. Progress is
        /// logged after every document.
        /// </remarks>
        /// <param name="_df">A <c>DocumentFilter</c> passed as <c>object</c>.</param>
        private void FilterDocList(object _df)
        {
            AddDocumentGridDelegate deleg = new AddDocumentGridDelegate(AddDocumentGrid);
            DocumentFilter df = (DocumentFilter)_df;
            if (df.Count <= 0)
            {
                return;
            }

            // The filter words do not change during the scan; build the array once.
            var words = df.ToArray();

            // Separate storage instance for this scan; disposed when the scan ends.
            using (DBDocumentStorage ds = new DBDocumentStorage(DBstorage.DB.ConnectionString))
            {
                List<Guid> docids = ds.GetDocumentsIDs();
                docs.Clear();
                int i = 0;
                int scancount = 0;
                foreach (Guid docid in docids)
                {
                    if (StopSearch)
                        break;

                    DBDocument d = null;
                    try
                    {
                        // Loading happens inside the try so that one unreadable document
                        // is logged and skipped instead of aborting the whole scan.
                        d = ds.GetDocumentByID(docid);
                        string ct = d.LastVersion.HeadersCollection[HttpResponseHeader.ContentType];
                        // A missing header yields null; treat such documents as non-text.
                        if (ct != null && ct.StartsWith("text", StringComparison.OrdinalIgnoreCase))
                        {
                            string src = HTMLUtils.GetPlainText(d.LastVersion.DataAsText);
                            List<FoundText> matches = HTMLUtils.GetMatches(src, words);
                            if (matches.Count > 0)
                            {
                                docs.Add(d.ID, d.URL);

                                // One "(index,length) text" entry per match, comma separated.
                                List<string> found = new List<string>(matches.Count);
                                foreach (FoundText m in matches)
                                {
                                    found.Add(string.Format("({0},{1}) {2}", m.Index, m.Length, src.Substring(m.Index, m.Length)));
                                }
                                string FoundWords = string.Join(", ", found.ToArray());

                                dgv.Invoke(deleg, i++, d.ID, d.URL, FoundWords);
                            }
                        }
                    }
                    catch (Exception ex) { Log(ex); }
                    finally
                    {
                        scancount++;
                        string message = string.Format("Scanned {0}/{1} documents. Found {2} documents.", scancount, docids.Count, i);

                        Log(message);
                        if (d != null)
                        {
                            d.Dispose();
                        }
                    }
                }
            }
        }