public void Start()
{
    try
    {
        using (DBDocumentStorage ds = new DBDocumentStorage(_config.DBPath))
        {
            Threads = new Dictionary<Thread, ThreadContext>();
            Dictionary<Guid, string> docs = ds.GetDocumentsList();

            // Split the document list into roughly ThreadCount batches and
            // start one indexing thread per batch.
            int batchSize = Math.Max(1, docs.Count / ThreadCount);
            Dictionary<Guid, string> batch = new Dictionary<Guid, string>();
            foreach (var doc in docs)
            {
                batch.Add(doc.Key, doc.Value);
                if (batch.Count >= batchSize)
                {
                    StartIndexThread(batch);
                    batch = new Dictionary<Guid, string>();
                }
            }
            // Hand any remaining documents to one last thread.
            if (batch.Count > 0)
                StartIndexThread(batch);

            Log("Threads started", "", LogType.Message);
        }
    }
    catch (Exception ex)
    {
        Log(string.Format("{0} \r\n{1}", ex.Message, ex.StackTrace), "SpydeeCore", LogType.Error, "Start");
    }
}

private void StartIndexThread(Dictionary<Guid, string> batch)
{
    // Each worker gets its own context carrying its share of the documents.
    Thread th = new Thread(Indexate);
    IndexatorThreadContext tc = new IndexatorThreadContext(batch, th, Thread.CurrentThread, this);
    th.IsBackground = true;
    th.Start(tc);
    Threads.Add(th, tc);
}
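// IndexatorThreadContext is referenced above but not defined in this listing.
// A minimal sketch of the shape Start() and Indexate() assume: it carries one
// batch of documents, references to the worker and parent threads and the
// owning core, and is disposable (Indexate wraps it in a using block). The
// class below, including the SpydeeCore type name, is an assumption for
// illustration, not the actual implementation.
public class IndexatorThreadContext : ThreadContext, IDisposable
{
    public Dictionary<Guid, string> DocList { get; private set; } // (ID, URL) pairs for this worker
    public Thread Worker { get; private set; }
    public Thread Parent { get; private set; }
    public SpydeeCore kernel { get; private set; } // assumed name of the owning core type

    public IndexatorThreadContext(Dictionary<Guid, string> docList, Thread worker, Thread parent, SpydeeCore core)
    {
        DocList = docList;
        Worker = worker;
        Parent = parent;
        kernel = core;
    }

    public void Dispose()
    {
        // Nothing to release in this sketch; the real class may free per-thread resources.
    }
}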
private void Indexate(object context)
{
    using (IndexatorThreadContext tc = (IndexatorThreadContext)context)
    using (DBDocumentStorage dbStorage = new DBDocumentStorage(tc.kernel.DBPath))
    {
        // Walk this thread's share of the document list and load each document's text.
        foreach (var dcinfo in tc.DocList)
        {
            DBDocument d = dbStorage.GetDocumentByID(dcinfo.Key);
            string text = d.Text;
            //Log(string.Format("Doc [{0}, {1}] scanned.", d.ID, d.URL), "", LogType.Message);
        }
    }
}
private void LoadFromDB(string connectionString)
{
    DBstorage = new DBDocumentStorage(connectionString);
    docs = DBstorage.GetDocuments();
    UpdateDocList();
}
public void SiteScan(object tc)
{
    using (CrawlerThreadContext mtc = (CrawlerThreadContext)tc)
    {
        CrawlerCore kernel = (CrawlerCore)mtc.kernel;
        DBDocumentStorage dbStorage = new DBDocumentStorage(kernel.DBPath);
        try
        {
            Log("Connecting to the database. Connection string: [" + kernel.DBPath + "] ...", "Core", LogType.Message);
            kernel.Log(string.Format("Thread {0} started, scanning site '{1}'",
                Thread.CurrentThread.ManagedThreadId, mtc.Site.Adress), "Core", LogType.Message);

            // Fetch and store robots.txt when the crawler is configured to use it.
            if (kernel.UseRobots)
            {
                try
                {
                    Uri robotsUrl = new Uri(mtc.Site.Adress, "robots.txt");
                    using (WebResponse wrr = kernel.MakeRequest(robotsUrl))
                    using (StreamReader r = new StreamReader(wrr.GetResponseStream(), Encoding.Default))
                    {
                        string content = r.ReadToEnd();
                        string hdr = HTMLUtils.GetHeaderAsString(wrr);
                        DBDocument d = dbStorage.CreateDocument(robotsUrl.OriginalString, content, hdr);
                        dbStorage.AddDocument(d);
                    }
                }
                catch (Exception e)
                {
                    kernel.Log(string.Format("robots.txt ERROR: {0}", e.Message), "Core", LogType.Error);
                }
            }

            while (mtc.SiteQueue.HasUnscanned && !mtc.Stop)
            {
                // Take the next address from the queue.
                Page pg = mtc.SiteQueue.GetNextUnscanned();
                try
                {
                    using (WebResponse w = kernel.MakeRequest(pg.Adress))
                    {
                        // Pick the encoding from the Content-Type header,
                        // falling back to the system default.
                        System.Net.Mime.ContentType ct = new System.Net.Mime.ContentType(w.ContentType);
                        Encoding fEncoding = Encoding.Default;
                        if (!string.IsNullOrEmpty(ct.CharSet))
                            fEncoding = Encoding.GetEncoding(ct.CharSet);

                        string content;
                        using (StreamReader reader = new StreamReader(w.GetResponseStream(), fEncoding))
                            content = reader.ReadToEnd();

                        kernel.Log(string.Format("Thread {0}: scanning page {1}",
                            Thread.CurrentThread.ManagedThreadId, pg.Adress), "Core", LogType.Message);

                        List<FoundText> ft = null;
                        if (ct.MediaType.StartsWith("text", StringComparison.InvariantCultureIgnoreCase))
                        {
                            HtmlDocument doc = new HtmlDocument();
                            doc.LoadHtml(content);

                            // Queue links to other pages, dropping query strings and fragments.
                            SiteList links = GetLinksFromHTMLDocument(doc);
                            foreach (string link in links)
                            {
                                string[] parts = link.Split('?');
                                parts = parts[0].Split('#');
                                mtc.SiteQueue.Add(parts[0]);
                            }

                            // Extract the plain text and search it for the key words.
                            string plainText = HTMLUtils.GetPlainText(doc);
                            ft = HTMLUtils.GetMatches(plainText, kernel.SearchWords.ToArray());
                        }

                        // Store the page only if it contains at least one match.
                        if (ft != null && ft.Count > 0)
                        {
                            string hdr = HTMLUtils.GetHeaderAsString(w);
                            DBDocument d = dbStorage.CreateDocument(pg.Adress.OriginalString, content, hdr);
                            dbStorage.AddDocument(d);
                        }
                    }

                    // Mark the page as scanned and throttle before the next request.
                    mtc.SiteQueue[pg] = true;
                    Thread.Sleep(mtc.kernel.SiteScanDelay);
                }
                catch (Exception e)
                {
                    kernel.Log(string.Format("Page '{0}' error: {1}", pg.Adress, e.Message), "Core", LogType.Error);
                    // Mark the failed page as scanned so the queue keeps making progress.
                    mtc.SiteQueue[pg] = true;
                }
            }
        }
        catch (Exception e)
        {
            kernel.Log(e.Message, "Core", LogType.Error);
            kernel.Log(e.StackTrace, "Core", LogType.Error);
        }
        finally
        {
            dbStorage.Dispose();
        }
    }
}
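// The two Split calls above drop the query string and fragment from each
// discovered link before it is queued. A hypothetical helper expressing the
// same rule (NormalizeLink is not part of the original code):
private static string NormalizeLink(string link)
{
    // Anything after '?' (query) or '#' (fragment) does not change which page
    // gets crawled, so cut at whichever separator appears first.
    int cut = link.IndexOfAny(new[] { '?', '#' });
    return cut < 0 ? link : link.Substring(0, cut);
}
// With it, the loop body becomes: mtc.SiteQueue.Add(NormalizeLink(link));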
private void FilterDocList(object _df)
{
    AddDocumentGridDelegate deleg = new AddDocumentGridDelegate(AddDocumentGrid);
    DocumentFilter df = (DocumentFilter)_df;
    if (df.Count == 0)
        return;

    using (DBDocumentStorage ds = new DBDocumentStorage(DBstorage.DB.ConnectionString))
    {
        List<Guid> docids = ds.GetDocumentsIDs();
        docs.Clear();
        int found = 0;
        int scancount = 0;
        foreach (Guid docid in docids)
        {
            if (StopSearch)
                break;

            DBDocument d = ds.GetDocumentByID(docid);
            try
            {
                // Only text documents are searched.
                string ct = d.LastVersion.HeadersCollection[HttpResponseHeader.ContentType];
                if (ct.StartsWith("text"))
                {
                    string src = HTMLUtils.GetPlainText(d.LastVersion.DataAsText);
                    List<FoundText> matches = HTMLUtils.GetMatches(src, df.ToArray());
                    if (matches.Count > 0)
                    {
                        docs.Add(d.ID, d.URL);

                        // Build a "(index,length) text" summary of every match.
                        string foundWords = "";
                        foreach (FoundText m in matches)
                        {
                            if (foundWords != "")
                                foundWords += ", ";
                            foundWords += string.Format("({0},{1}) {2}", m.Index, m.Length, src.Substring(m.Index, m.Length));
                        }
                        dgv.Invoke(deleg, found++, d.ID, d.URL, foundWords);
                    }
                }
            }
            catch (Exception ex)
            {
                Log(ex);
            }
            finally
            {
                scancount++;
                Log(string.Format("Scanned {0}/{1} documents. Found {2} documents.", scancount, docids.Count, found));
                d.Dispose();
            }
        }
    }
}
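// FoundText is consumed in both SiteScan and FilterDocList but not defined in
// this listing. From the usage (m.Index, m.Length, src.Substring(m.Index, m.Length))
// it is at least a position/length pair for one match; a minimal assumed shape:
public class FoundText
{
    public int Index { get; set; }  // start offset of the match in the plain text
    public int Length { get; set; } // length of the matched word
}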