Code example #1
        static void Main(string[] args)
        {
            //Handles early exits
            AppDomain.CurrentDomain.ProcessExit += new EventHandler(OnProcessExit);
            //Loads DLLs embedded as resources in the EXE when the runtime cannot resolve them
            AppDomain.CurrentDomain.AssemblyResolve += (sender, arguments) => {
                string resourceName = "AssemblyLoadingAndReflection." +
                                      new AssemblyName(arguments.Name).Name + ".dll";
                using (var stream = Assembly.GetExecutingAssembly()
                                    .GetManifestResourceStream(resourceName)) {
                    //Not an embedded assembly; fall back to normal resolution
                    if (stream == null)
                    {
                        return null;
                    }
                    byte[] assemblyData = new byte[stream.Length];
                    stream.Read(assemblyData, 0, assemblyData.Length);
                    return Assembly.Load(assemblyData);
                }
            };

            //Application initialization

            webStringUtils = new WebStringUtils(getAppFolder());
            objSaveUtils   = new ObjSaveUtils(getAppFolder() + '/');
            //Help options
            Options options = new Options();

            options.AddOption(new Option("h", "help", false, "display this help dialog"));
            options.AddOption(new Option("ph", "pattern help", false, "help with the -p command"));
            options.AddOption(new Option("v", "verbose", false, "verbose mode"));
            //options.AddOptionGroup(helpOptions);
            //Crawl options
            options.AddOption(new Option("dc", "dont crawl", false, "do not execute crawl. (for the purpose of using other utilities only)"));
            options.AddOption(new Option("vi", "visited", false, "print visited pages after completion (n.i.)"));
            options.AddOption(new Option("ul", "unlock", false, "unlocks crawler from target domain"));
            options.AddOption(new Option("bc", "backcrawl", false, "deep copy, enables discovery of hidden pages (slow)"));
            options.AddOption(new Option("p", "pattern", true, "regex pattern for restricting pages"));
            options.AddOption(new Option("d", "depth", true, "depth of the search (default 5)"));
            options.AddOption(new Option("c", "courtesy", true, "delay between page loads, in milliseconds"));
            options.AddOption(new Option("t", "threads", true, "number of allowed threads. More threads = more aggressive (must be 2+)"));
            options.AddOption(new Option("i", "iterative", true, "scans urls iteratively in the form of url/1,2.. starting at <param>"));
            options.AddOption(new Option("id", "iterative depth", true, "Depth to scan to at each step of the iteration"));
            //File options
            options.AddOption(new Option("O", "overwrite", false, "overwrite files when scan starts"));
            Option downloadImages = new Option("di", "images", true, "download images while crawling (takes regex for filtering)");

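            //-di takes a single optional value: a regex that filters which image URLs are downloaded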
            downloadImages.OptionalArg  = true;
            downloadImages.NumberOfArgs = 1;
            options.AddOption(downloadImages);
            Option downloadText = new Option("dt", "text", false, "download text bodies for analysis <tag, regex>");

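            //-dt takes up to two space-separated values: a tag selector and an optional regex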
            downloadText.OptionalArg    = true;
            downloadText.NumberOfArgs   = 2;
            downloadText.ValueSeparator = ' ';
            options.AddOption(downloadText);
            options.AddOption(new Option("il", "include link", false, "include links to the parent page in text files"));
            options.AddOption(new Option("g", "gallery", false, "only download files to one folder"));
            options.AddOption(new Option("o", "output", true, "output location (defaults to exe location)"));
            options.AddOption(new Option("l", "load", true, "load data from previous scan, named <param>"));
            //Database options
            options.AddOption(new Option("dbl", "database links", false, "Save visited links into the DB, with tags defined by -dt"));
            options.AddOption(new Option("dbi", "database images", false, "Save image locations to database, with tags defined by -dt"));
            options.AddOption(new Option("ddh", "HTML", false, "don't download HTML while crawling"));
            options.AddOption(new Option("dbc", "database check", false, "Check the database to prevent duplicate entries (slow)"));
            options.AddOption(new Option("dbip", "database ip", true, "the IP address of the database to dump to"));
            //Data processing
            options.AddOption(new Option("m", "markov", true, "generate a markov chain of <param> prefix length and save it"));
            options.AddOption(new Option("mp", "print markov", true, "prints out [param] sentences from the chain (Must use -g)"));
            //Attempts to parse args
            try {
                ICommandLineParser parser = new PosixParser();
                //Help options
                CommandLine   helpCmd       = parser.Parse(options, args);
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.Width       = 100;
                helpFormatter.DescPadding = 0x1;
                string helpHeader = "\nSKS Web crawler/info extractor v0.1";
                string helpFooter = "\nExample Usage: [EXENAME] http://pornhub.com -di -d 5"
                                    + "\nSite Image Gallery: [URL] -di -ddh -g"
                                    + "\nFullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*"
                                    + "\nSankaku tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3"
                                    + "\nIterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>"
                                    + "\nMarkov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40"
                                    + "\nInsert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi";
                if (helpCmd.HasOption("ph"))
                {
                    Console.WriteLine("\n-p and -i take a regular exp. as an argument, searching all URLs"
                                      + "\nthat match the pattern. E.g., \"test.com/page\" would "
                                      + "\nmatch \"test.com/page/page2\". To test for any subdomain,"
                                      + "\nthe following pattern would operate on [anything].test.com:"
                                      + "\nhttps?://([^/.]+[.])*test.com(.*)");
                    return;
                }
                data.verbose = helpCmd.HasOption("v");
                //Crawl options
                CommandLine crawlCmd = parser.Parse(options, args);

                if (args.Length > 0)
                {
                    data.startURL = args[0];
                }
                data.backCrawl      = crawlCmd.HasOption("bc");
                data.iterative      = crawlCmd.HasOption("i");
                shouldCrawl         = !crawlCmd.HasOption("dc");
                data.iteratorStart  = Convert.ToInt32(crawlCmd.GetOptionValue("i", "0"));
                data.iterativeDepth = Convert.ToInt32(crawlCmd.GetOptionValue("id", "0"));
                data.crawlPattern   = crawlCmd.GetOptionValue("p", ".*");
                data.maxDepth       = Convert.ToInt32(crawlCmd.GetOptionValue("d", "5"));
                data.delay          = Convert.ToInt32(crawlCmd.GetOptionValue("c", "0"));
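                //Limits crawl concurrency to the -t value (default 2) via a custom task scheduler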
                crawlThreadExecutor = new LimitedConcurrencyLevelTaskScheduler(Convert.ToInt32(crawlCmd.GetOptionValue("t", "2")));
                crawlThreadFactory  = new TaskFactory(crawlThreadExecutor);
                crawlLocked         = !crawlCmd.HasOption("ul");

                //File options
                CommandLine fileCmd = parser.Parse(options, args);
                data.overwrite      = fileCmd.HasOption("O");
                data.downloadImages = fileCmd.HasOption("di");
                data.imagePattern   = fileCmd.GetOptionValue("di", "");
                data.downloadText   = fileCmd.HasOption("dt");
                data.downloadHTML   = !fileCmd.HasOption("ddh");
                data.gallery        = fileCmd.HasOption("g");

                if (data.downloadText)
                {
                    string[] imageOptions = fileCmd.GetOptionValues("dt");
                    //textTag = cmd.GetOptionValue("dt", null);
                    data.textTag = imageOptions[0];
                    //The second value (the regex) is optional
                    data.textPattern = imageOptions.Length > 1 ? imageOptions[1] : "";
                    data.includeLinks = fileCmd.HasOption("il");
                }
                if (fileCmd.HasOption("l"))
                {
                    saveFile = fileCmd.GetOptionValue("l");
                    //Loads the chain
                    if (fileCmd.HasOption("m"))
                    {
                        markovChain = (MarkovChain)objSaveUtils.LoadObject("markov_" + saveFile, typeof(MarkovChain));
                    }
                    //Loads the tries
                    data.urlTrie    = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                    data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                    data.mediaTrie  = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
                }
                else
                {
                    if (args.Length > 0)
                    {
                        saveFile = webStringUtils.UnFuck(args[0]);
                    }
                    //If not loading chain from file, create new chain
                    if (fileCmd.HasOption("m"))
                    {
                        markovChain = new MarkovChain(Convert.ToInt32(fileCmd.GetOptionValue("m", "3")));
                    }
                    //Attempts to automatically load file name
                    try {
                        data.urlTrie    = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                        data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                        data.mediaTrie  = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
                    } catch (Exception) {
                        //Generate tries if not loadable
                        data.urlTrie    = new WebTrie();
                        data.assistTrie = new WebTrie();
                        data.mediaTrie  = new WebTrie();
                    }
                }
                data.outputFolder = fileCmd.GetOptionValue("o", getAppFolder()) + "CrawlResults\\";

                //Database options
                CommandLine dbCmd = parser.Parse(options, args);
                if (dbCmd.HasOption("dbip"))
                {
                    TagDBDriver.instantiateDB(dbCmd.GetOptionValue("dbip"));
                    data.dataBaseImages = dbCmd.HasOption("dbi");
                    data.dataBaseLinks  = dbCmd.HasOption("dbl");
                    data.dataBaseCheck  = dbCmd.HasOption("dbc");
                }

                //Data processing options
                CommandLine dpCmd = parser.Parse(options, args);
                printMarkov     = dpCmd.HasOption("mp");
                markovSentences = Convert.ToInt32(dpCmd.GetOptionValue("mp", "0"));

                if (helpCmd.HasOption("h") || args.Length == 0)
                {
                    printHelp();
                    return;
                }
            } catch (Exception exception) {
                Console.WriteLine("Invalid arguments or parameters. use -h for help (" + exception + ")");
                return;
            }
            //instantiates trie

            //creates regex for site locking
            if (crawlLocked)
            {
                string regexURL = Regex.Replace(args[0], "https?://", "");
                data.crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";
            }
            else
            {
                data.crawlDomain = ".*";
            }

            try {
                Crawl(args[0], data);
            } catch (Exception e) {
                Console.WriteLine("Scan aborted: " + e);
            }
            // Environment.Exit(0);
        }
Code example #2
        //false = left, true = right
        //This should probably be split into two or three classes that inherit
        public int Crawl(string url, int currentDepth, bool sidedness)
        {
            //helpers at their base level do not download content
            bool isHelper = false;
            //Iterative searchers spawn recursors to assist
            bool isIterativeRecursor = false;

            if (data.iterative && !url.Substring(0, url.LastIndexOf('/') + 1).Equals(data.startURL))
            {
                isIterativeRecursor = true;
            }

            int newPages = 0;

            //Early return if url is not in the domain or if it has been previously checked
            if (!url.Equals(startURL))
            {
                //Checks to see if the URL has already been searched or if it's not in the domain- if so, terminate.
                if (!Regex.IsMatch(url, data.crawlPattern) || !Regex.IsMatch(url, data.crawlDomain) || data.urlTrie.contains(url))
                {
                    return(newPages);
                }
                //When -dbc is set, skip URLs that already exist in the database
                if (data.dataBaseCheck)
                {
                    Task <bool> isInDB = Task.Run(() => TagDBDriver.entryExists(url));
                    isInDB.Wait();
                    data.urlTrie.InsertURL(url);
                    if (isInDB.Result)
                    {
                        return(newPages);
                    }
                }
                newPages++;
            }
            else if (url.Equals(startURL))
            {
                if (data.urlTrie.contains(url))
                {
                    isHelper = true;
                    //isIterativeRecursor = true;
                }
            }

            //Iterative helpers still check the URLTrie

            if (isIterativeRecursor && (data.urlTrie.contains(url) || !Regex.IsMatch(url, data.crawlPattern)))
            {
                return(newPages);
            }

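            //Record the URL in the visited trie before fetching so repeat visits are skipped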
            data.urlTrie.InsertURL(url);


            //Courtesy delaying
            Thread.Sleep(data.delay);

            if (data.verbose)
            {
                Console.WriteLine("Starting crawl on " + url);
            }

            Document HTMLDoc       = null;
            int      errorIterator = 0;

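            //Retry loop: keep requesting until the page loads; 404/503 advance the iterative index, 429 backs off and retries, other errors abort the page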
            while (HTMLDoc == null)
            {
                try {
                    HTMLDoc = getHTML(url);
                } catch (Exception exception) {
                    //Helpers do not increase the iterator
                    if (data.iterative && !isIterativeRecursor)
                    {
                        if (exception.ToString().Contains("404"))
                        {
                            iteratorLocation++;
                            Console.WriteLine("404. Now on page " + iteratorLocation + ". Increasing index....");
                        }
                        else if (exception.ToString().Contains("503"))
                        {
                            iteratorLocation++;
                            Console.WriteLine("503. Now on page " + iteratorLocation + ". Increasing index....");
                        }
                        else
                        {
                            throw;
                        }
                    }

                    if (exception.ToString().Contains("429"))
                    {
                        //Handling for rate limit exceptions
                        errorIterator++;
                        if (errorIterator < 2)
                        {
                            Console.WriteLine(exception);
                            Console.WriteLine("Rate limited. waiting...");
                            Thread.Sleep(15000 + data.delay);
                            Console.WriteLine("Retrying...");
                        }
                        else
                        {
                            Console.WriteLine("Continued rate limiting. Thread waiting for one minute and increasing courtesy delay.");
                            Thread.Sleep(60000);
                            data.delay   += 10000;
                            errorIterator = 0;
                        }
                    }
                    else
                    {
                        CU.WCol(CU.nl + "Could not load page. " + url + " : " + exception.Message + CU.nl, CU.r);
                        return(newPages);
                    }
                }
            }
            //Grab links
            Elements links         = HTMLDoc.Select("a[href]");
            int      numberOfLinks = links.Count();
            //Grabs the page title
            string titleString = HTMLDoc.Title ?? "Untitled";
            if (!data.verbose)
            {
                if (!isHelper)
                {
                    Console.WriteLine(threadName + " Crawling " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
                }
                else
                {
                    Console.WriteLine(threadName + " Finishing " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
                }
            }
            if (data.verbose)
            {
                Console.WriteLine("Page name: " + titleString);
            }
            //Writes content to file
            try {
                //Skip downloads if this crawler is a helper at this level
                if (!isHelper)
                {
                    //Prep information for DB entries
                    FileInfo[] files = null;
                    string[]   tags  = null;
                    //Download HTML
                    if (data.downloadHTML)
                    {
                        Thread downloadHTML = new Thread(() => downloadManager.downloadHTML(HTMLDoc));
                        downloadHTML.Start();
                    }
                    //Download text within specified tags (-dt [tag])
                    if (data.downloadText)
                    {
                        Elements        text         = HTMLDoc.Select(data.textTag);
                        Task <string[]> downloadText = new Task <string[]>(() => downloadManager.downloadText(text));
                        downloadText.Start();
                        tags = downloadText.Result;
                    }
                    //Download images and links to images
                    if (data.downloadImages)
                    {
                        //Checks for links to images
                        Elements imageElements = HTMLDoc.Select("img");
                        if (imageElements != null)
                        {
                            //Append links to images as well

                            foreach (Element element in links)
                            {
                                if (Regex.IsMatch(element.AbsUrl("href"), @".*(\.jpg|\.png|\.gif|\.webm)"))
                                {
                                    imageElements.Add(element);
                                }
                            }
                        }
                        Task <FileInfo[]> downloadImages = new Task <FileInfo[]>(() => downloadManager.DownloadElementsReturnNames(imageElements));
                        downloadImages.Start();
                        files = downloadImages.Result;
                    }
                    //Saves image locations to Database
                    if (data.dataBaseImages && files != null)
                    {
                        foreach (FileInfo file in files)
                        {
                            new Thread(() => TagDBDriver.insertImageWithTags(file.FullName, tags)).Start();
                        }
                    }
                    //Saves links to Database
                    if (data.dataBaseLinks)
                    {
                        new Thread(() => TagDBDriver.insertImageWithTags(url, tags)).Start();
                    }
                }
            } catch (Exception e) {
                Console.WriteLine("Could not write to file: " + e);
            }

            //Checks if the search needs to recurse
            if (numberOfLinks <= 0)
            {
                if (data.verbose)
                {
                    Console.WriteLine("No links on page. Going back up...");
                }
                return(newPages);
            }
            //Iterative crawls ignore max depth; iterative recursors stop at the iterative depth
            try {
                //Recurses the algorithm if not at max depth
                if (currentDepth + 1 > data.maxDepth && !data.iterative)
                {
                    return(newPages);
                }
                //Do shallow recursion while in iterative mode
                else if (currentDepth + 1 >= data.iterativeDepth && isIterativeRecursor)
                {
                    return(newPages);
                }

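                //Pages with many links spawn a helper crawler to share the work (non-iterative crawls only)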
                if (numberOfLinks > data.linkAssist && !data.assistTrie.contains(url) && !data.iterative)
                {
                    data.assistTrie.InsertURL(url);
                    this.workingURL   = url;
                    this.workingDepth = currentDepth;
                    createHelper();
                }

                //Right-handed search
                //can these be one method?
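                //At the entry depth, each side of the search only covers its half of the links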
                int sizeLimit = (int)Math.Round(numberOfLinks / 2f);
                if (sidedness)
                {
                    for (int i = numberOfLinks - 1; i > 0; i--)
                    {
                        //Only search half at the entry depth
                        if (currentDepth == startDepth && i < sizeLimit)
                        {
                            break;
                        }
                        string currentLinkRight = links[i].AbsUrl("href");
                        //Checks to make sure that the URL isn't a page in-reference and that it doesn't link to another part of the page. Also ensures link validity.
                        //Also ignore links to other pages positioned along the iterative crawl
                        if (string.IsNullOrEmpty(currentLinkRight) || currentLinkRight.Equals(url) ||
                            (data.iterative && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                            (currentLinkRight.Contains('#') && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('#')).Equals(url)) ||
                            links[i].HasAttr("rel"))
                        {
                            continue;
                        }
                        //Ensures the link is a well-formed URL; if not, move on to the next link
                        try {
                            WebRequest.Create(currentLinkRight);
                        } catch (Exception) {
                            continue;
                        }
                        newPages += Crawl(currentLinkRight, currentDepth + 1, sidedness);
                    }
                } //Left-handed search
                else
                {
                    for (int i = 0; i < numberOfLinks - 1; i++)
                    {
                        if (currentDepth == startDepth && i > sizeLimit)
                        {
                            break;
                        }
                        string currentLinkLeft = links[i].AbsUrl("href");
                        if (string.IsNullOrEmpty(currentLinkLeft) || currentLinkLeft.Equals(url) ||
                            (data.iterative && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                            (currentLinkLeft.Contains('#') && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('#')).Equals(url)) ||
                            links[i].HasAttr("rel"))
                        {
                            continue;
                        }
                        //Ensures the link is a well-formed URL; if not, move on to the next link
                        try {
                            WebRequest.Create(currentLinkLeft);
                        } catch (Exception) {
                            continue;
                        }
                        newPages += Crawl(currentLinkLeft, currentDepth + 1, sidedness);
                    }
                }


                //Backcrawl to hit missed directories at this level
                if (data.backCrawl)
                {
                    while (url.Substring(8).Contains("/"))
                    {
                        Console.WriteLine("Backcrawling unfound urls...");
                        Crawl(url = url.Substring(0, url.LastIndexOf('/')), currentDepth - 1, sidedness);
                    }
                }
            } catch (Exception e) {
                CU.WCol(CU.nl + "Dead page: " + e, CU.r, CU.y);
                CU.WCol(CU.nl + e.StackTrace, CU.r, CU.y);
                Console.WriteLine("Now checking depth " + currentDepth + ", link: " + url);
            }
            return(newPages);
        }