// Verifies HtmlProcessor.LinkInfo normalization of raw hrefs:
// scheme-less hrefs get the default "http" scheme and a "www." domain prefix,
// and multi-label public suffixes (e.g. "net.ph") are recognized as the TLD.
// Uses Assert.AreEqual(expected, actual) instead of Assert.IsTrue(a == b) so
// a failure reports the actual value instead of just "expected true".
public void HtmlProcessor_LinkInfo_Check()
{
    TldParser parser = new TldParser();
    HtmlProcessor.LinkInfo li = new HtmlProcessor.LinkInfo(parser);

    // Bare domain: scheme and www prefix are filled in.
    li.Href = "google.com";
    Assert.AreEqual("www.google.com", li.Domain);
    //Assert.AreEqual("www.google.com", li.DomainOrSubdomain);
    Assert.AreEqual("http", li.DomainScheme);
    Assert.AreEqual("com", li.Tld);

    // Host with a two-label public suffix: whole "net.ph" is the TLD.
    li.Href = "jubacs.somee.net.ph";
    Assert.AreEqual("www.jubacs.somee.net.ph", li.Domain);
    //Assert.AreEqual("www.jubacs.somee.net.ph", li.DomainOrSubdomain);
    Assert.AreEqual("http", li.DomainScheme);
    Assert.AreEqual("net.ph", li.Tld);

    // Fully-qualified href: host is kept as-is.
    li.Href = "http://www.jubacs.somee.com";
    Assert.AreEqual("www.jubacs.somee.com", li.Domain);
}
// Runs one synchronous crawl pass: for each collector created over the pool,
// downloads the collector's URL, extracts links with HtmlProcessor, and stores
// every robots-allowed link back into the pool and repository. Per-collector
// progress is painted to console row WORK_AREA_TOP + c.SeqNo.
//   WORK_AREA_TOP  - first console row of the per-collector status area
//   parallelCount  - number of collectors to create over the pool
//   repository     - link persistence (SaveLink)
//   robots         - robots.txt gatekeeper (IsAllowed)
//   pool           - URL work queue; newly discovered links are pushed here
//   history        - visited-URL set
//   tldParser      - TLD parser handed to HtmlProcessor via the property bag
private static void TestRun1(int WORK_AREA_TOP, int parallelCount, Repository repository, RobotService robots, CollectorPool pool, VisitedUrls history, TldParser tldParser)
{
    CreateCollectorPool(pool, history, parallelCount).ForEach(
        c =>
        {
            //ManualResetEvent done = new ManualResetEvent(false);
            // The property bag travels with the download request so the
            // completion callback can recover the parser from ea.Properties.
            Dictionary<string, object> properties = new Dictionary<string, object>();
            properties.Add("TldParser", tldParser);
            Uri uri = c.Link.ToUri();
            WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (fetching)", uri.ToUriShort());
            WebDownloader web = new WebDownloader(uri, properties, ea =>
            {
                // Status line: byte count on success, exception message on failure.
                WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} [{1}] ({2})", uri.ToUriShort(), ea.Status, (ea.Exception.IsNull() ? "responded: " + ea.Stream.Length + " bytes" : "exception: " + ea.Exception.Message));
                if (ea.Stream.IsNull())
                {
                    // No response body — back off for 5 s before this slot is reused.
                    Thread.Sleep(5000);
                }
                else
                {
                    HtmlProcessor processor = new HtmlProcessor(
                        uri.ToString(),
                        ea.Stream,
                        ((TldParser)ea.Properties["TldParser"]));
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1})", uri.ToUriShort(), processor.Links.Count);
                    int pushedLinks = 0; // links that passed robots.txt and were stored
                    int linkCounter = 1; // 1-based progress counter over all found links
                    processor.Links.ForEach(l =>
                    {
                        // Progress line shows the link's domain truncated to 10 chars.
                        WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, prc={2} {3} ({4}))", uri.ToUriShort(), processor.Links.Count, (l.Domain.Length > 10 ? l.Domain.Substring(0, 10) : l.Domain), l.Tld, linkCounter);
                        if (robots.IsAllowed(string.Empty, l.Href.ToUri()))
                        {
                            ++pushedLinks;
                            pool.Store(l.Href);
                            // Save the edge in both directions (source->target, target<-source).
                            repository.SaveLink(uri.ToString(), l.Href, string.Empty, l);
                            repository.SaveLink(l.Href, string.Empty, uri.ToString(), l);
                            // NOTE(review): this adds the SAME source uri once per
                            // allowed link; it looks like it belongs after the
                            // ForEach — confirm whether VisitedUrls dedupes.
                            history.Add(uri.ToString());
                        }
                        ++linkCounter;
                    });
                    WriteXY(0, WORK_AREA_TOP + c.SeqNo, "{0} (found={1}, added={2} links) [DONE]", uri.ToUriShort(), processor.Links.Count, pushedLinks);
                    ea.Stream.Close();
                }
                //((ManualResetEvent)ea.Properties["State"]).Set();
            });
            web.Download();
            //done.WaitOne();
        });
}
// Entry point: connects to the configured persistence store, seeds the crawl
// pool, launches `parallelCount` long-running collector tasks, and runs until
// the user presses ENTER. On shutdown it commits the pool, disposes the store,
// prints crawl statistics, and writes the log to disk.
//
// Intended (currently hard-coded — TODO: parse from `args`) settings:
//   --help
//   --provider=redis|mysql|sqlserver|mongodb
//   --server= / --port= / --database= / --uid= / --pwd=
//   --parallel-count
static void Main(string[] args)
{
    LogBuffer = new StringBuilder();

    int parallelCount = 70;
    //string provider = "redis"; string server = "127.0.0.1"; string port = "6379";
    string provider = "mongodb";
    string server = "50.62.1.71";
    string port = "27017";
    string database = "ls";

    Console.CancelKeyPress += new ConsoleCancelEventHandler(Console_CancelKeyPress);
    PrintAndClearHeaderArea();
    LogLine("Buffer width: " + Console.BufferWidth);
    LogLine("Buffer height: " + Console.BufferHeight);

    IPersistence persistence = PersistenceFactory.GetPersistence(provider,
        new Dictionary<string, string> { { "server", server }, { "port", port }, { "database", database } });
    if (!persistence.Ping())
    {
        LogLine("Unable to connect to the database. Aborting.");
        Environment.Exit(1);
    }

    Repository repository = new Repository(persistence);
    RobotService robots = new RobotService();
    CollectorPool pool;
    VisitedUrls history;
    VisitedDomains domainHistory = new VisitedDomains();

    Log("Loading pool...");
    repository.Load(out pool);
    // Seed URLs so the crawler has work even when the stored pool is empty.
    string[] seeds =
    {
        "squidoo.com", "ezinearticles.com", "hubpages.com", "technorati.com",
        "buzzle.com", "suite101.com", "goarticles.com", "apsense.com",
        "allaboutcounseling.com", "digg.com", "dmoz.org", "dir.yahoo.com"
    };
    foreach (string seed in seeds)
    {
        pool.Store(seed);
    }
    int poolCount = pool.Count;
    LogLine("done. Found " + poolCount);

    Log("Loading link data...");
    repository.LoadData();
    LogLine("done. Found " + repository.Links.Count);
    Log("Loading history...");
    repository.Load(out history, DateTime.Today);
    LogLine("done");
    Log("Loading TLD parser...");
    TldParser tldParser = new TldParser();
    LogLine("done");
    Thread.Sleep(5000); // leave the load summary on screen briefly
    PrintAndClearHeaderArea();

    // Funnel task continuations through one synchronization context so the
    // collectors' console writes don't interleave mid-line.
    SynchronizationContext.SetSynchronizationContext(new SynchronizationContext());
    TaskScheduler scheduler = TaskScheduler.FromCurrentSynchronizationContext();
    TaskScheduler.UnobservedTaskException += (o, ea) =>
    {
        LogLine("Exception: {0}", ea.Exception.Message);
        ea.SetObserved(); // keep an unobserved task exception from tearing down the process
    };

    DateTime start = DateTime.Now;
    CancellationTokenSource cancelTokenSource = new CancellationTokenSource();
    CancellationToken cancelToken = cancelTokenSource.Token;

    Task[] tasks = new Task[parallelCount];
    for (int i = 0; i < parallelCount; i++)
    {
        TaskState state = new TaskState();
        state.Pool = pool;
        state.VisitedUrls = history;
        state.VisitedDomains = domainHistory;
        state.TldParser = tldParser;
        state.Robots = robots;
        state.Repository = repository;
        state.PoolManagerHandler = PoolManager;
        state.ProgressHandler = CollectorProgress;
        state.CancelToken = cancelToken;
        state.CollectorID = i;
        tasks[i] = new Task(CollectorProcessor, state, cancelToken, TaskCreationOptions.LongRunning);
        tasks[i].Start(scheduler);
    }

    WriteXY(0, WORK_AREA_TOP + parallelCount + 2, "press [ENTER] to quit");
    Console.Read();
    cancelTokenSource.Cancel(true);
    try
    {
        Task.WaitAll(tasks, cancelToken);
    }
    catch
    {
        // Cancellation surfaces as OperationCanceledException/AggregateException
        // here; either way the collectors are done.
        LogLine("Collectors stopped.");
    }
    Thread.Sleep(3000);
    cancelTokenSource.Dispose();
    PrintAndClearHeaderArea();
    LogLine("Closing database...");
    repository.Commit<CollectorPool>(pool, null);
    ((IDisposable)persistence).Dispose();

    // Statistics. BUG FIX: the original used TimeSpan.Seconds — the 0-59
    // "seconds" component — which under-reports any run over a minute and
    // divides by zero for sub-second runs. TotalSeconds is the real duration;
    // the Max() guard keeps the rate finite for instant runs.
    TimeSpan elapsed = DateTime.Now - start;
    double elapsedSeconds = Math.Max(elapsed.TotalSeconds, 0.001);
    LogLine("Pool count: before = {0} now = {1}", poolCount, pool.Count);
    int totalLinks = repository.CrawlDateLinks[DateTime.Today.ToString("yyMMdd")].Value.Count;
    LogLine("Links crawled: {0} in {1} seconds ({2}/sec)", totalLinks, elapsed.TotalSeconds, totalLinks / elapsedSeconds);

    // (Commented-out JsonSerialize dumps of the repository collections removed;
    // re-add ad hoc when debugging persistence.)

    SaveLog();
    Environment.Exit(0);
}
// Verifies TldParser extracts the top-level domain from a URI host.
// NOTE(review): "TldParserer" looks like a typo (TldParser_GetTld_…?); the name
// is kept unchanged so test discovery/reporting stays stable.
// Uses Assert.AreEqual so a failure reports the actual TLD returned, instead
// of Assert.IsTrue's bare "expected true".
public void TldParserer()
{
    Uri uri = new Uri("http://www.yahoo.com");
    TldParser parser = new TldParser();
    Assert.AreEqual("com", parser.GetTld(uri.Host));
}