Example #1
        /// <summary>
        ///  Threaded worker for scanning webpages. Automatically creates other copies
        ///  of itself and divides work for quicker scanning.
        /// </summary>
        /// <param name="id">Numeric ID of this worker, used in its thread name.</param>
        /// <param name="url">URL the worker starts crawling from.</param>
        /// <param name="startDepth">Depth at which the worker starts its crawl.</param>
        /// <param name="sidedness">Which half of the work the worker takes (false = left, true = right).</param>
        /// <param name="data">Shared crawl configuration for this scan.</param>
        public CrawlWorker(int id, string url, int startDepth, bool sidedness, CrawlStruct data)
        {
            this.id           = id;
            this.startURL     = url;
            this.workingURL   = url;
            this.sidedness    = sidedness;
            this.startDepth   = startDepth;
            this.workingDepth = startDepth;
            this.data         = data;

            if (data.iterative)
            {
                this.workingURL += data.iteratorStart;
            }

            downloadManager   = new DownloadManager(data);
            this.pagesCrawled = 0;

            webStringUtils = new WebStringUtils(data.outputFolder);
            //Constructs string name from id and sidedness
            StringBuilder nameBuilder = new StringBuilder("Worker ").Append(id);

            if (!sidedness)
            {
                nameBuilder.Append(" (Left)");
            }
            else
            {
                nameBuilder.Append(" (Right)");
            }
            this.threadName = nameBuilder.ToString();
            Console.WriteLine("Creating " + threadName);
        }
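For context, a minimal usage sketch for this constructor: it creates one left-sided and one right-sided worker for the same start URL. The CrawlStruct values and the Run() entry point are assumptions for illustration only; they are not part of this example.

        //Hypothetical usage: create a left and a right worker for the same start URL.
        //crawlThreadFactory and getAppFolder() come from Main below; Run() is an assumed entry point.
        CrawlStruct data  = new CrawlStruct();
        data.outputFolder = getAppFolder() + "CrawlResults\\";  //Mirrors the default used in Main
        data.iterative    = false;

        CrawlWorker leftWorker  = new CrawlWorker(0, "http://example.com", 0, false, data);
        CrawlWorker rightWorker = new CrawlWorker(1, "http://example.com", 0, true,  data);

        crawlThreadFactory.StartNew(() => leftWorker.Run());   //Run() is assumed, not shown above
        crawlThreadFactory.StartNew(() => rightWorker.Run());
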
        static void Main(string[] args)
        {
            //Handles early exits
            AppDomain.CurrentDomain.ProcessExit += new EventHandler(OnProcessExit);
            //Loads embedded DLLs from the EXE's resources if they can't be resolved automatically
            AppDomain.CurrentDomain.AssemblyResolve += (sender, arguments) => {
                string resourceName = "AssemblyLoadingAndReflection." +
                                      new AssemblyName(arguments.Name).Name + ".dll";
                using (var stream = Assembly.GetExecutingAssembly()
                                    .GetManifestResourceStream(resourceName)) {
                    if (stream == null)
                    {
                        //Assembly is not embedded; fall back to the default resolution behavior
                        return null;
                    }
                    byte[] assemblyData = new byte[stream.Length];
                    stream.Read(assemblyData, 0, assemblyData.Length);
                    return Assembly.Load(assemblyData);
                }
            };

            //Application initialization

            webStringUtils = new WebStringUtils(getAppFolder());
            objSaveUtils   = new ObjSaveUtils(getAppFolder() + '/');
            //Help
            Options options = new Options();

            options.AddOption(new Option("h", "help", false, "display this help dialog"));
            options.AddOption(new Option("ph", "pattern help", false, "help with the -p command"));
            options.AddOption(new Option("v", "verbose", false, "verbose mode"));
            //options.AddOptionGroup(helpOptions);
            //Crawl options
            options.AddOption(new Option("dc", "dont crawl", false, "do not execute the crawl (for using the other utilities only)"));
            options.AddOption(new Option("vi", "visited", false, "print visited pages after completion (not implemented)"));
            options.AddOption(new Option("ul", "unlock", false, "unlocks crawler from target domain"));
            options.AddOption(new Option("bc", "backcrawl", false, "deep copy, enables discovery of hidden pages (slow)"));
            options.AddOption(new Option("p", "pattern", true, "regex pattern for restricting pages"));
            options.AddOption(new Option("d", "depth", true, "depth of the search (default 10)"));
            options.AddOption(new Option("c", "courtesy", true, "delay between page loads, in milliseconds"));
            options.AddOption(new Option("t", "threads", true, "number of allowed threads. More threads = more aggressive (must be 2+)"));
            options.AddOption(new Option("i", "iterative", true, "scans URLs iteratively in the form url/1, url/2, ... starting at <param>"));
            options.AddOption(new Option("id", "iterative depth", true, "Depth to scan to at each step of the iteration"));
            //File options
            options.AddOption(new Option("O", "overwrite", false, "overwrite files when scan starts"));
            Option downloadImages = new Option("di", "images", true, "download images while crawling (takes regex for filtering)");

            downloadImages.OptionalArg  = true;
            downloadImages.NumberOfArgs = 1;
            options.AddOption(downloadImages);
            Option downloadText = new Option("dt", "text", false, "download text bodies for analysis <tag, regex>");

            downloadText.OptionalArg    = true;
            downloadText.NumberOfArgs   = 2;
            downloadText.ValueSeparator = ' ';
            options.AddOption(downloadText);
            options.AddOption(new Option("il", "include link", false, "include links to the parent page in text files"));
            options.AddOption(new Option("g", "gallery", false, "only download files to one folder"));
            options.AddOption(new Option("o", "output", true, "output location (defaults to exe location)"));
            options.AddOption(new Option("l", "load", true, "load data from previous scan, named <param>"));
            //Database options
            options.AddOption(new Option("dbl", "database links", false, "Save visited links into the DB, with tags defined by -dt"));
            options.AddOption(new Option("dbi", "database images", false, "Save image locations to database, with tags defined by -dt"));
            options.AddOption(new Option("ddh", "HTML", false, "don't download HTML while crawling"));
            options.AddOption(new Option("dbc", "database check", false, "Check the database to prevent duplicate entries (slow)"));
            options.AddOption(new Option("dbip", "database ip", true, "the IP address of the database to dump to"));
            //Data processing
            options.AddOption(new Option("m", "markov", true, "generate and save a Markov chain with prefix length <param>"));
            options.AddOption(new Option("mp", "print markov", true, "print <param> sentences from the chain (must be used with -g)"));
            //Attempts to parse args
            try {
                ICommandLineParser parser = new PosixParser();
                //Help options
                CommandLine   helpCmd       = parser.Parse(options, args);
                HelpFormatter helpFormatter = new HelpFormatter();
                helpFormatter.Width       = 100;
                helpFormatter.DescPadding = 0x1;
                string helpHeader = "\nSKS Web crawler/info extractor v0.1";
                string helpFooter = "\nExample Usage: [EXENAME] http://pornhub.com -di -d 5"
                                    + "\nSite Image Gallery: [URL] -di -ddh -g"
                                    + "\nFullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*"
                                    + "\nSankaku tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3"
                                    + "\nIterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>"
                                    + "\nMarkov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40"
                                    + "\nInsert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi";
                if (helpCmd.HasOption("ph"))
                {
                    Console.WriteLine("\n-p and -i take a regular exp. as an argument, searching all URLs"
                                      + "\nthat match the pattern. I.E., \"test.com/page \" would "
                                      + "\nmatch \"test.com/page/page2\". To test for any subdomain,"
                                      + "\nthe following pattern would operate on [anything].test.com:"
                                      + "\nhttps?://([^/.]+[.])*test.com(.*)");
                    return;
                }
                data.verbose = helpCmd.HasOption("v");
                //Crawl options
                CommandLine crawlCmd = parser.Parse(options, args);

                if (args.Length > 0)
                {
                    data.startURL = args[0];
                }
                data.backCrawl      = crawlCmd.HasOption("bc");
                data.iterative      = crawlCmd.HasOption("i");
                shouldCrawl         = !crawlCmd.HasOption("dc");
                data.iteratorStart  = Convert.ToInt32(crawlCmd.GetOptionValue("i", "0"));
                data.iterativeDepth = Convert.ToInt32(crawlCmd.GetOptionValue("id", "0"));
                data.crawlPattern   = crawlCmd.GetOptionValue("p", ".*");
                data.maxDepth       = Convert.ToInt32(crawlCmd.GetOptionValue("d", "5"));
                data.delay          = Convert.ToInt32(crawlCmd.GetOptionValue("c", "0"));
                crawlThreadExecutor = new LimitedConcurrencyLevelTaskScheduler(Convert.ToInt32(crawlCmd.GetOptionValue("t", "2")));
                crawlThreadFactory  = new TaskFactory(crawlThreadExecutor);
                crawlLocked         = !crawlCmd.HasOption("ul");

                //File options
                CommandLine fileCmd = parser.Parse(options, args);
                data.overwrite      = fileCmd.HasOption("O");
                data.downloadImages = fileCmd.HasOption("di");
                data.imagePattern   = fileCmd.GetOptionValue("di", "");
                data.downloadText   = fileCmd.HasOption("dt");
                data.downloadHTML   = !fileCmd.HasOption("ddh");
                data.gallery        = fileCmd.HasOption("g");

                if (data.downloadText)
                {
                    //-dt takes a tag plus an optional regex: <tag, regex>
                    string[] textOptions = fileCmd.GetOptionValues("dt");
                    data.textTag      = textOptions[0];
                    data.textPattern  = textOptions.Length > 1 ? textOptions[1] : "";
                    data.includeLinks = fileCmd.HasOption("il");
                }
                if (fileCmd.HasOption("l"))
                {
                    saveFile = fileCmd.GetOptionValue("l");
                    //Loads the chain
                    if (fileCmd.HasOption("m"))
                    {
                        markovChain = (MarkovChain)objSaveUtils.LoadObject("markov_" + saveFile, typeof(MarkovChain));
                    }
                    //Loads the tries
                    data.urlTrie    = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                    data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                    data.mediaTrie  = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
                }
                else
                {
                    if (args.Length > 0)
                    {
                        saveFile = webStringUtils.UnFuck(args[0]);
                    }
                    //If not loading chain from file, create new chain
                    if (fileCmd.HasOption("m"))
                    {
                        markovChain = new MarkovChain(Convert.ToInt32(fileCmd.GetOptionValue("m", "3")));
                    }
                    //Attempts to automatically load file name
                    try {
                        data.urlTrie    = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                        data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                        data.mediaTrie  = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
                    } catch (Exception) {
                        //Generate tries if not loadable
                        data.urlTrie    = new WebTrie();
                        data.assistTrie = new WebTrie();
                        data.mediaTrie  = new WebTrie();
                    }
                }
                data.outputFolder = fileCmd.GetOptionValue("o", getAppFolder()) + "CrawlResults\\";

                //Database options
                CommandLine dbCmd = parser.Parse(options, args);
                if (dbCmd.HasOption("dbip"))
                {
                    TagDBDriver.instantiateDB(dbCmd.GetOptionValue("dbip"));
                    data.dataBaseImages = dbCmd.HasOption("dbi");
                    data.dataBaseLinks  = dbCmd.HasOption("dbl");
                    data.dataBaseCheck  = dbCmd.HasOption("dbc");
                }

                //Data processing options
                CommandLine dpCmd = parser.Parse(options, args);
                printMarkov     = dpCmd.HasOption("mp");
                markovSentences = Convert.ToInt32(dpCmd.GetOptionValue("mp", "0"));

                if (helpCmd.HasOption("h") || args.Length == 0)
                {
                    printHelp();
                    return;
                }
            } catch (Exception exception) {
                Console.WriteLine("Invalid arguments or parameters. Use -h for help (" + exception + ")");
                return;
            }

            //creates regex for site locking
            if (crawlLocked)
            {
                string regexURL = Regex.Replace(args[0], "https?://", "");
                data.crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";
            }
            else
            {
                data.crawlDomain = ".*";
            }

            try {
                Crawl(args[0], data);
            } catch (Exception e) {
                Console.WriteLine("Scan aborted: " + e);
            }
            // System.exit(0);
        }
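To illustrate the site-locking pattern that Main builds when -ul is not given, here is a small standalone sketch; the domain and URLs are placeholders, and the caveat in the last comment applies to the listing above as well.

        //Standalone illustration of the crawlDomain pattern ("test.com" is a placeholder)
        string regexURL    = Regex.Replace("https://test.com", "https?://", "");  // "test.com"
        string crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";

        Console.WriteLine(Regex.IsMatch("https://www.test.com/page/1", crawlDomain)); //True
        Console.WriteLine(Regex.IsMatch("https://other-site.org/page", crawlDomain)); //False
        //Note: the dots in the domain are not escaped, so "https://testXcom" would also match.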
        /// <summary>
        ///  Creates the download manager for the current crawl, along with its
        ///  string utilities and save queue.
        /// </summary>
        /// <param name="data">Shared crawl configuration for this scan.</param>
        public DownloadManager(CrawlStruct data)
        {
            DownloadManager.data = data;
            webStringUtils       = new WebStringUtils(data.outputFolder);
            saveQueue            = new SaveQueue();
        }
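The AssemblyResolve handler in Main assumes every dependency DLL is embedded in the EXE as a resource named "AssemblyLoadingAndReflection.<AssemblyName>.dll". A quick way to check what actually got embedded (an illustrative snippet, not part of the original program):

        //Lists the manifest resource names compiled into the running EXE, so the
        //"AssemblyLoadingAndReflection.*.dll" names expected by the resolver can be verified.
        foreach (string name in Assembly.GetExecutingAssembly().GetManifestResourceNames())
        {
            Console.WriteLine(name);
        }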