Esempio n. 1
0
        /// <summary>
        /// Runs a freshly constructed <see cref="LinkExtractor"/> over the supplied HTML.
        /// </summary>
        /// <param name="html">The HTML text handed to <c>GetHyperlinks</c>.</param>
        /// <returns>The value returned by <c>LinkExtractor.GetHyperlinks</c>.</returns>
        private List<Uri> GetLinks(string html)
        {
            return new LinkExtractor().GetHyperlinks(html);
        }
Esempio n. 2
0
        /// <summary>
        /// Creates the view model: instantiates the link extractor and cancellation source,
        /// an empty URL collection, the three commands, and sets the state to idle.
        /// </summary>
        public MainWindowViewModel()
        {
            // Collaborators held in private fields.
            _linkExtractor = new LinkExtractor();
            _cancellationTokenSource = new CancellationTokenSource();

            // The URL collection starts out empty.
            UrlData = new ObservableCollection<BaseUrlInfo>();

            // Each command wraps the handler method of the same name.
            AnalyzeUrlCommand = new BaseCommand(AnalyzeUrl);
            OpenFileCommand = new BaseCommand(OpenFile);
            CloseAppCommand = new BaseCommand(CloseApp);

            State = MainWindowState.Idle;
        }
Esempio n. 3
0
        /// <summary>
        /// Program entry point: builds the crawler settings, creates the shared
        /// <see cref="LinkExtractor"/> from two string lists (file extensions and host names),
        /// then constructs a <see cref="Crawler"/> and starts the crawl.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            var crawlerSettings = new CrawlerSettings
            {
                Function = MyFunction,
                OutputPath = "Sample.txt",
                RespectRobots = true,
                Seeds = new[] { @"http://5by5.tv/", @"http://maximumfun.org/", @"https://www.relay.fm/" },
                MaxDepth = 8,
                WorkerCount = 64
            };

            // File extensions passed as the first LinkExtractor argument.
            IEnumerable<string> excludedExtensions = new[]
            {
                // images
                ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif",
                ".tiff", ".ai", ".drw", ".dxf", ".eps", ".ps", ".svg",

                // audio
                ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff",

                // video
                ".3gp", ".asf", ".asx", ".avi", ".mov", ".mp4", ".mpg", ".qt", ".rm", ".swf", ".wmv",
                ".m4a",

                // other
                ".css", ".pdf", ".doc", ".exe", ".bin", ".rss", ".zip", ".rar"
            };

            // Host names passed as the second LinkExtractor argument.
            IEnumerable<string> excludedHosts = new[]
            {
                "twitter.com",
                "youtube.com",
                "reddit.com",
                "facebook.com",
                "amazon.com",
                "itunes.apple.com",
                "firstpost.com",
                "wikipedia.org",
                "play.google.com",
                "pinterest.com"
            };

            s_extractor = new LinkExtractor(excludedExtensions, excludedHosts);

            new Crawler(crawlerSettings).Crawl();
        }
Esempio n. 4
0
        /// <summary>
        /// Extracts links for <paramref name="obPageData"/> and appends them to its
        /// <c>m_Outlinks</c>; while <paramref name="iLevel"/> is below
        /// <paramref name="iStopLevel"/>, also processes every outlink as a page of its own.
        /// </summary>
        /// <param name="obPageData">Page whose <c>Url</c> is read and whose <c>m_Outlinks</c> is appended to.</param>
        /// <param name="iLevel">Current level; incremented once per loop pass.</param>
        /// <param name="iStopLevel">The loop exits once <paramref name="iLevel"/> reaches this value.</param>
        /// <param name="bFlatHierarchy">
        /// When true, the <c>OutLinks</c> of each processed child page are also appended
        /// to <paramref name="obPageData"/>'s <c>m_Outlinks</c>.
        /// </param>
        private void ProcessPageForImageLinks(PageData obPageData, Int32 iLevel, Int32 iStopLevel, bool bFlatHierarchy)
        {
            // NOTE(review): obLinks is never read in this method.
            LinkDataCollection obLinks         = new LinkDataCollection();
            LinkExtractor      obLinkExtractor = new LinkExtractor(obPageData.Url, iLevel);

            do
            {
                // NOTE(review): status is never inspected; ExtractLinks is invoked again on the
                // same extractor on every pass of this loop - confirm that is intended.
                LinkStatus         status   = obLinkExtractor.ExtractLinks();
                LinkDataCollection tempColl = obLinkExtractor.Links;

                // Append the extractor's links to the page's outlinks.
                ICollectionSupport.AddAll(obPageData.m_Outlinks, tempColl);

                // Only exit of the loop: the stop level has been reached.
                if (iLevel >= iStopLevel)
                {
                    break;
                }
                iLevel++;
                // Fresh collection; the child links are added to the page only after the
                // foreach below has finished enumerating m_Outlinks.
                tempColl = new LinkDataCollection();
                foreach (LinkData obLinkData in obPageData.m_Outlinks)
                {
                    if (obLinkData.LinkType == LinkType.Outlink)
                    {
                        PageData obPage = new PageData();
                        obPage.m_strUrl = obLinkData.Url;
                        // NOTE(review): this calls ProcessPageForLinks, not ProcessPageForImageLinks -
                        // confirm which one is meant.
                        ProcessPageForLinks(obPage, iLevel, iStopLevel, bFlatHierarchy);
                        if (bFlatHierarchy)
                        {
                            ICollectionSupport.AddAll(tempColl, obPage.OutLinks);
                        }
                    }
                }
                // Flat mode: merge the collected child links into this page's outlinks.
                if (bFlatHierarchy)
                {
                    ICollectionSupport.AddAll(obPageData.m_Outlinks, tempColl);
                }
            }while(true);
        }
Esempio n. 5
0
 /// <summary>
 /// Assigns a new <see cref="LinkExtractor"/> to <c>target</c>.
 /// </summary>
 public LinkExtractorTests() => target = new LinkExtractor();
Esempio n. 6
0
        /// <summary>
        /// Expects <c>linkInString</c> to return "Found no jpgs" for text containing
        /// the short link https://goo.gl/bERbbS.
        /// </summary>
        public void testLinkShortHTTPSNoJpg()
        {
            const string input = "hej med dig https://goo.gl/bERbbS";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("Found no jpgs", actual);
        }
Esempio n. 7
0
        /// <summary>
        /// Expects <c>linkInString</c> to return "https://test.jpg/" for text containing
        /// the short link https://goo.gl/X2KftW.
        /// </summary>
        public void testLinkShortHTTPS()
        {
            const string input = "hej med dig https://goo.gl/X2KftW";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("https://test.jpg/", actual);
        }
Esempio n. 8
0
        /// <summary>
        /// Expects <c>linkInString</c> to return "https://test.jpg" when the text contains
        /// two jpg links, that one first.
        /// </summary>
        public void testLinkMultipleHTTPS()
        {
            const string input = "hej med dig https://test.jpg https://test2.jpg";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("https://test.jpg", actual);
        }
Esempio n. 9
0
        /// <summary>
        /// Expects <c>linkInString</c> to return "No links" for text without any link.
        /// </summary>
        public void testLinkNoHTTPS()
        {
            const string input = "hej med dig";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("No links", actual);
        }
Esempio n. 10
0
        /// <summary>
        /// Expects <c>linkInString</c> to return "Found no jpgs" when the only link is a .png.
        /// </summary>
        public void testLinkNotJpg()
        {
            const string input = "hej med dig https://test.png";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("Found no jpgs", actual);
        }
Esempio n. 11
0
        /// <summary>
        /// Expects <c>linkInString</c> to return the jpg link itself when the text contains one.
        /// </summary>
        public void testLinkWithJpg()
        {
            const string input = "hej med dig https://test.jpg";
            var extractor = new LinkExtractor();

            var actual = extractor.linkInString(input);

            Assert.AreEqual("https://test.jpg", actual);
        }
        /// <summary>
        /// Downloads the page at <c>details.urlofWebpage</c>, calls <c>SearchContent</c> with
        /// <c>details.searchExpression</c> for every link returned by
        /// <c>LinkExtractor.Extract</c> (and for the page itself when it is not among those
        /// links), then inserts one row into <c>[dbo].[SearchReport]</c>.
        /// </summary>
        /// <param name="details">Supplies the page URL and the search expression.</param>
        /// <returns>
        /// "Success" when the report row was inserted; "Failure" when any exception was thrown
        /// along the way. The same value is left in the static <c>message</c> field.
        /// </returns>
        public static string StartSearch(SearchRequest details)
        {
            try
            {
                // Reset the static counter; presumably SearchContent increments it - verify.
                count = 0;

                // Read the whole page, then release the client, stream and reader right away
                // instead of holding the download open during the searches below.
                string content;
                using (WebClient client = new WebClient())
                using (Stream data = client.OpenRead(details.urlofWebpage))
                using (StreamReader reader = new StreamReader(data))
                {
                    content = reader.ReadToEnd();
                }

                List<string> links = LinkExtractor.Extract(content);

                // Search the page itself only when the loop below will not already cover it.
                if (!links.Contains(details.urlofWebpage))
                {
                    SearchContent(details.urlofWebpage, details.searchExpression);
                }

                foreach (string link in links)
                {
                    SearchContent(link, details.searchExpression);
                    Console.WriteLine(link);
                }

                const string query = "INSERT INTO [dbo].[SearchReport] (Date,URL,Printout,NoOfHits) VALUES (@Date,@URL,@Printout, @NoOfHits)";

                using (SqlCommand cmd = new SqlCommand(query, sqlconn))
                {
                    cmd.Parameters.AddWithValue("@Date", DateTime.UtcNow.ToString("dd-MM-yyyy"));
                    cmd.Parameters.AddWithValue("@URL", details.urlofWebpage);
                    cmd.Parameters.AddWithValue("@Printout", details.searchExpression);
                    cmd.Parameters.AddWithValue("@NoOfHits", count);

                    sqlconn.Open();
                    cmd.ExecuteNonQuery();
                }

                message = "Success";
            }
            catch (Exception ex)
            {
                // The caller only gets a status string, so leave a trace of the real cause.
                Console.Error.WriteLine(ex);
                message = "Failure";
            }
            finally
            {
                // Single close point for both the success and the failure path.
                sqlconn.Close();
            }

            return message;
        }
 /// <summary>
 /// Creates the <see cref="LinkExtractor"/> and <see cref="LinkMatcher"/> instances
 /// stored in the fixture's fields.
 /// </summary>
 public LinkExtractionTests()
 {
     // Extractor first, matcher second.
     _linkExtractor = new LinkExtractor();
     _linkMatcher = new LinkMatcher();
 }