/// <summary>
/// Runs a freshly created <c>LinkExtractor</c> over the supplied HTML.
/// </summary>
/// <param name="html">HTML text passed to <c>GetHyperlinks</c>.</param>
/// <returns>The list produced by <c>GetHyperlinks</c> for <paramref name="html"/>.</returns>
private List<Uri> GetLinks(string html)
{
    return new LinkExtractor().GetHyperlinks(html);
}
/// <summary>
/// Builds the view model: creates its collaborators, the URL collection,
/// the three commands, and puts the window into the idle state.
/// </summary>
public MainWindowViewModel()
{
    // Collaborators.
    _linkExtractor = new LinkExtractor();
    _cancellationTokenSource = new CancellationTokenSource();

    // Collection that receives the URL entries.
    UrlData = new ObservableCollection<BaseUrlInfo>();

    // Commands, each wrapping one handler method of this class.
    AnalyzeUrlCommand = new BaseCommand(AnalyzeUrl);
    OpenFileCommand = new BaseCommand(OpenFile);
    CloseAppCommand = new BaseCommand(CloseApp);

    // Assigned last, after everything above exists.
    State = MainWindowState.Idle;
}
/// <summary>
/// Program entry point: configures a crawl over three seed sites, installs a
/// link extractor with banned extensions and banned URLs, then runs the crawler.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var settings = new CrawlerSettings
    {
        Function = MyFunction,
        OutputPath = "Sample.txt",
        RespectRobots = true,
        Seeds = new[]
        {
            @"http://5by5.tv/",
            @"http://maximumfun.org/",
            @"https://www.relay.fm/"
        },
        MaxDepth = 8,
        WorkerCount = 64
    };

    // File extensions handed to the extractor as its first argument.
    IEnumerable<string> bannedExtensions = new[]
    {
        // images
        ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp",
        ".tif", ".tiff", ".ai", ".drw", ".dxf", ".eps", ".ps", ".svg",

        // audio
        ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff",

        // video
        ".3gp", ".asf", ".asx", ".avi", ".mov", ".mp4", ".mpg", ".qt", ".rm",
        ".swf", ".wmv", ".m4a",

        // other
        ".css", ".pdf", ".doc", ".exe", ".bin", ".rss", ".zip", ".rar"
    };

    // Host names handed to the extractor as its second argument.
    IEnumerable<string> bannedHosts = new[]
    {
        "twitter.com",
        "youtube.com",
        "reddit.com",
        "facebook.com",
        "amazon.com",
        "itunes.apple.com",
        "firstpost.com",
        "wikipedia.org",
        "play.google.com",
        "pinterest.com"
    };

    // The shared extractor is assigned before the crawler is created and started.
    s_extractor = new LinkExtractor(bannedExtensions, bannedHosts);

    var crawler = new Crawler(settings);
    crawler.Crawl();
}
/// <summary>
/// Extracts the links for <paramref name="obPageData"/>'s URL into its outlink
/// collection and, while <paramref name="iLevel"/> is below
/// <paramref name="iStopLevel"/>, passes every outlink to
/// <c>ProcessPageForLinks</c> one level deeper.
/// </summary>
/// <param name="obPageData">Page whose <c>Url</c> is scanned and whose <c>m_Outlinks</c> is filled.</param>
/// <param name="iLevel">Current level; incremented on each pass of the loop.</param>
/// <param name="iStopLevel">Level at which the loop stops after extracting links.</param>
/// <param name="bFlatHierarchy">
/// When true, the outlinks of each processed child page are also appended to
/// <paramref name="obPageData"/>'s outlinks.
/// </param>
private void ProcessPageForImageLinks(PageData obPageData, Int32 iLevel, Int32 iStopLevel, bool bFlatHierarchy)
{
    LinkExtractor extractor = new LinkExtractor(obPageData.Url, iLevel);

    while (true)
    {
        // The status returned by ExtractLinks is not inspected.
        extractor.ExtractLinks();

        LinkDataCollection extracted = extractor.Links;
        ICollectionSupport.AddAll(obPageData.m_Outlinks, extracted);

        if (iLevel >= iStopLevel)
        {
            break;
        }

        iLevel++;

        // Gathers the children's outlinks so they are appended only after the
        // enumeration of m_Outlinks below has finished.
        LinkDataCollection childLinks = new LinkDataCollection();

        foreach (LinkData link in obPageData.m_Outlinks)
        {
            if (link.LinkType != LinkType.Outlink)
            {
                continue;
            }

            PageData childPage = new PageData();
            childPage.m_strUrl = link.Url;
            ProcessPageForLinks(childPage, iLevel, iStopLevel, bFlatHierarchy);

            if (bFlatHierarchy)
            {
                ICollectionSupport.AddAll(childLinks, childPage.OutLinks);
            }
        }

        if (bFlatHierarchy)
        {
            ICollectionSupport.AddAll(obPageData.m_Outlinks, childLinks);
        }
    }
}
/// <summary>
/// Creates the <c>LinkExtractor</c> instance stored in <c>target</c>.
/// </summary>
public LinkExtractorTests()
{
    target = new LinkExtractor();
}
/// <summary>
/// Expects <c>linkInString</c> to answer "Found no jpgs" for a message
/// containing the short link https://goo.gl/bERbbS.
/// </summary>
public void testLinkShortHTTPSNoJpg()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig https://goo.gl/bERbbS");

    // Assert
    Assert.AreEqual("Found no jpgs", actual);
}
/// <summary>
/// Expects <c>linkInString</c> to answer "https://test.jpg/" for a message
/// containing the short link https://goo.gl/X2KftW.
/// </summary>
public void testLinkShortHTTPS()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig https://goo.gl/X2KftW");

    // Assert
    Assert.AreEqual("https://test.jpg/", actual);
}
/// <summary>
/// Expects <c>linkInString</c> to answer "https://test.jpg" when the message
/// contains both https://test.jpg and https://test2.jpg.
/// </summary>
public void testLinkMultipleHTTPS()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig https://test.jpg https://test2.jpg");

    // Assert
    Assert.AreEqual("https://test.jpg", actual);
}
/// <summary>
/// Expects <c>linkInString</c> to answer "No links" for a message without any link.
/// </summary>
public void testLinkNoHTTPS()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig");

    // Assert
    Assert.AreEqual("No links", actual);
}
/// <summary>
/// Expects <c>linkInString</c> to answer "Found no jpgs" when the only link
/// in the message is https://test.png.
/// </summary>
public void testLinkNotJpg()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig https://test.png");

    // Assert
    Assert.AreEqual("Found no jpgs", actual);
}
/// <summary>
/// Expects <c>linkInString</c> to answer "https://test.jpg" when the message
/// contains that single link.
/// </summary>
public void testLinkWithJpg()
{
    // Arrange
    var extractor = new LinkExtractor();

    // Act
    var actual = extractor.linkInString("hej med dig https://test.jpg");

    // Assert
    Assert.AreEqual("https://test.jpg", actual);
}
/// <summary>
/// Downloads the page named by <paramref name="details"/>, runs
/// <c>SearchContent</c> for the page and for every link extracted from it,
/// then writes one row to <c>[dbo].[SearchReport]</c>.
/// </summary>
/// <param name="details">
/// Supplies <c>urlofWebpage</c> (the page to download) and
/// <c>searchExpression</c> (passed to <c>SearchContent</c>).
/// </param>
/// <returns>
/// <c>"Success"</c> when the report row was written, <c>"Failure"</c> when any
/// exception occurred along the way. The same value is stored in <c>message</c>.
/// </returns>
public static string StartSearch(SearchRequest details)
{
    try
    {
        // Reset the hit counter; presumably SearchContent updates it - confirm against SearchContent.
        count = 0;

        List<string> links = LinkExtractor.Extract(DownloadPage(details.urlofWebpage));

        // Search the page itself only when it is not among its own links;
        // otherwise the loop below already covers it.
        if (!links.Contains(details.urlofWebpage))
        {
            SearchContent(details.urlofWebpage, details.searchExpression);
        }

        foreach (string link in links)
        {
            SearchContent(link, details.searchExpression);
            Console.WriteLine(link);
        }

        SaveSearchReport(details);
        message = "Success";
    }
    catch (Exception ex)
    {
        // Callers only see the "Failure" string, so record the cause instead of discarding it.
        Console.Error.WriteLine(ex);
        message = "Failure";
    }
    finally
    {
        // Single place where the shared connection is closed, on success and on failure alike.
        // Closing a connection that is already closed is harmless.
        if (sqlconn != null)
        {
            sqlconn.Close();
        }
    }

    return message;
}

/// <summary>
/// Reads the full body of <paramref name="url"/> and releases the client and
/// stream before returning, so no network resource stays open during the search.
/// </summary>
private static string DownloadPage(string url)
{
    using (WebClient client = new WebClient())
    using (Stream data = client.OpenRead(url))
    using (StreamReader reader = new StreamReader(data))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Inserts one report row (UTC date, URL, search expression, current hit count)
/// through the shared connection. The connection is opened here and closed by the caller.
/// </summary>
private static void SaveSearchReport(SearchRequest details)
{
    const string query = "INSERT INTO [dbo].[SearchReport] (Date,URL,Printout,NoOfHits) VALUES (@Date,@URL,@Printout, @NoOfHits)";

    // Invariant culture keeps the stored date independent of the machine's calendar and digit settings.
    string reportDate = DateTime.UtcNow.ToString("dd-MM-yyyy", System.Globalization.CultureInfo.InvariantCulture);

    using (SqlCommand cmd = new SqlCommand(query, sqlconn))
    {
        cmd.Parameters.AddWithValue("@Date", reportDate);
        cmd.Parameters.AddWithValue("@URL", details.urlofWebpage);
        cmd.Parameters.AddWithValue("@Printout", details.searchExpression);
        cmd.Parameters.AddWithValue("@NoOfHits", count);

        sqlconn.Open();
        cmd.ExecuteNonQuery();
    }
}
/// <summary>
/// Creates the <c>LinkExtractor</c> and <c>LinkMatcher</c> instances stored in
/// <c>_linkExtractor</c> and <c>_linkMatcher</c>.
/// </summary>
public LinkExtractionTests()
{
    _linkExtractor = new LinkExtractor();
    _linkMatcher = new LinkMatcher();
}