/// <summary>
/// Threaded worker for scanning webpages. Automatically creates other copies
/// of itself and divides work for quicker scanning.
/// </summary>
/// <param name="id">Numeric identifier for this worker.</param>
/// <param name="url">URL the worker starts crawling from.</param>
/// <param name="startDepth">Depth at which this worker begins.</param>
/// <param name="sidedness">False for the left half of the work, true for the right half.</param>
/// <param name="data">Shared crawl configuration.</param>
public CrawlWorker(int id, string url, int startDepth, bool sidedness, CrawlStruct data) {
    this.id = id;
    this.startURL = url;
    this.workingURL = url;
    this.sidedness = sidedness;
    this.startDepth = startDepth;
    this.workingDepth = startDepth;
    this.data = data;
    if (data.iterative) {
        this.workingURL += data.iteratorStart;
    }
    downloadManager = new DownloadManager(data);
    this.pagesCrawled = 0;
    webStringUtils = new WebStringUtils(data.outputFolder);
    //Constructs the thread name from the id and the sidedness
    StringBuilder nameBuilder = new StringBuilder("Worker ").Append(id);
    if (!sidedness) {
        nameBuilder.Append(" (Left)");
    } else {
        nameBuilder.Append(" (Right)");
    }
    this.threadName = nameBuilder.ToString();
    Console.WriteLine("Creating " + threadName);
}
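// Usage sketch (illustrative, not part of the project): one way a pair of workers
// could be constructed, assuming CrawlStruct exposes the members the constructor
// reads (iterative, iteratorStart, outputFolder) as settable fields and that the
// caller splits the crawl between a left-sided and a right-sided worker. The
// scheduler that actually runs the workers is omitted.
private static void SpawnWorkerPairExample() {
    CrawlStruct data = new CrawlStruct();
    data.iterative = false;                  //no url/1,2.. iteration in this example
    data.iteratorStart = 0;
    data.outputFolder = @"C:\CrawlResults\"; //illustrative output location

    //sidedness = false -> "Worker 0 (Left)", sidedness = true -> "Worker 1 (Right)"
    CrawlWorker leftWorker = new CrawlWorker(0, "http://example.com", 0, false, data);
    CrawlWorker rightWorker = new CrawlWorker(1, "http://example.com", 0, true, data);
}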
static void Main(string[] args) {
    //Handles early exits
    AppDomain.CurrentDomain.ProcessExit += new EventHandler(OnProcessExit);
    //Loads embedded DLLs into the EXE if they don't load automatically
    AppDomain.CurrentDomain.AssemblyResolve += (sender, arguments) => {
        string resourceName = "AssemblyLoadingAndReflection." + new AssemblyName(arguments.Name).Name + ".dll";
        using (var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)) {
            //If the embedded resource is missing, fall back to default resolution instead of throwing
            if (stream == null) {
                return null;
            }
            byte[] assemblyData = new byte[stream.Length];
            stream.Read(assemblyData, 0, assemblyData.Length);
            return Assembly.Load(assemblyData);
        }
    };
    //Application initialization
    webStringUtils = new WebStringUtils(getAppFolder());
    objSaveUtils = new ObjSaveUtils(getAppFolder() + '/');
    //Help options
    Options options = new Options();
    options.AddOption(new Option("h", "help", false, "display this help dialog"));
    options.AddOption(new Option("ph", "pattern help", false, "help with the -p command"));
    options.AddOption(new Option("v", "verbose", false, "verbose mode"));
    //options.AddOptionGroup(helpOptions);
    //Crawl options
    options.AddOption(new Option("dc", "dont crawl", false, "do not execute crawl (for the purpose of using other utilities only)"));
    options.AddOption(new Option("vi", "visited", false, "print visited pages after completion (n.i.)"));
    options.AddOption(new Option("ul", "unlock", false, "unlocks crawler from target domain"));
    options.AddOption(new Option("bc", "backcrawl", false, "deep copy, enables discovery of hidden pages (slow)"));
    options.AddOption(new Option("p", "pattern", true, "regex pattern for restricting pages"));
    options.AddOption(new Option("d", "depth", true, "depth of the search (default 5)"));
    options.AddOption(new Option("c", "courtesy", true, "delay between page loads, in milliseconds"));
    options.AddOption(new Option("t", "threads", true, "number of allowed threads. More threads = more aggressive (must be 2+)"));
    options.AddOption(new Option("i", "iterative", true, "scans urls iteratively in the form of url/1,2.. starting at <param>"));
    options.AddOption(new Option("id", "iterative depth", true, "depth to scan to at each step of the iteration"));
    //File options
    options.AddOption(new Option("O", "overwrite", false, "overwrite files when scan starts"));
    Option downloadImages = new Option("di", "images", true, "download images while crawling (takes regex for filtering)");
    downloadImages.OptionalArg = true;
    downloadImages.NumberOfArgs = 1;
    options.AddOption(downloadImages);
    Option downloadText = new Option("dt", "text", false, "download text bodies for analysis <tag, regex>");
    downloadText.OptionalArg = true;
    downloadText.NumberOfArgs = 2;
    downloadText.ValueSeparator = ' ';
    options.AddOption(downloadText);
    options.AddOption(new Option("il", "include link", false, "include links to the parent page in text files"));
    options.AddOption(new Option("g", "gallery", false, "only download files to one folder"));
    options.AddOption(new Option("o", "output", true, "output location (defaults to exe location)"));
    options.AddOption(new Option("l", "load", true, "load data from previous scan, named <param>"));
    //Database options
    options.AddOption(new Option("dbl", "database links", false, "save visited links into the DB, with tags defined by -dt"));
    options.AddOption(new Option("dbi", "database images", false, "save image locations to the database, with tags defined by -dt"));
    options.AddOption(new Option("ddh", "HTML", false, "don't download HTML while crawling"));
    options.AddOption(new Option("dbc", "database check", false, "check the database to prevent duplicate entries (slow)"));
    options.AddOption(new Option("dbip", "database ip", true, "the IP address of the database to dump to"));
    //Data processing
    options.AddOption(new Option("m", "markov", true, "generate a markov chain with a prefix length of <param> and save it"));
    options.AddOption(new Option("mp", "print markov", true, "prints out [param] sentences from the chain (must use -g)"));
    //Attempts to parse args
    try {
        ICommandLineParser parser = new PosixParser();
        //Help options
        CommandLine helpCmd = parser.Parse(options, args);
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.Width = 100;
        helpFormatter.DescPadding = 0x1;
        string helpHeader = "\nSKS Web crawler/info extractor v0.1";
        string helpFooter = "\nExample Usage: java -jar [JARNAME] http://pornhub.com -di -d 5"
            + "\nSite Image Gallery: [URL] -di -ddh -g"
            + "\nFullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*"
            + "\nSankaku tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3"
            + "\nIterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>"
            + "\nMarkov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40"
            + "\nInsert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi";
        if (helpCmd.HasOption("ph")) {
            Console.WriteLine("\n-p and -i take a regular expression as an argument, searching all URLs"
                + "\nthat match the pattern. E.g., \"test.com/page\" would"
                + "\nmatch \"test.com/page/page2\". To test for any subdomain,"
                + "\nthe following pattern would operate on [anything].test.com:"
                + "\nhttps?://([^/.]+[.])*test.com(.*)");
            return;
        }
        data.verbose = helpCmd.HasOption("v");
        //Crawl options
        CommandLine crawlCmd = parser.Parse(options, args);
        if (args.Length > 0) {
            data.startURL = args[0];
        }
        data.backCrawl = crawlCmd.HasOption("bc");
        data.iterative = crawlCmd.HasOption("i");
        shouldCrawl = !crawlCmd.HasOption("dc");
        data.iteratorStart = Convert.ToInt32(crawlCmd.GetOptionValue("i", "0"));
        data.iterativeDepth = Convert.ToInt32(crawlCmd.GetOptionValue("id", "0"));
        data.crawlPattern = crawlCmd.GetOptionValue("p", ".*");
        data.maxDepth = Convert.ToInt32(crawlCmd.GetOptionValue("d", "5"));
        data.delay = Convert.ToInt32(crawlCmd.GetOptionValue("c", "0"));
        crawlThreadExecutor = new LimitedConcurrencyLevelTaskScheduler(Convert.ToInt32(crawlCmd.GetOptionValue("t", "2")));
        crawlThreadFactory = new TaskFactory(crawlThreadExecutor);
        crawlLocked = !crawlCmd.HasOption("ul");
        //File options
        CommandLine fileCmd = parser.Parse(options, args);
        data.overwrite = fileCmd.HasOption("O");
        data.downloadImages = fileCmd.HasOption("di");
        data.imagePattern = fileCmd.GetOptionValue("di", "");
        data.downloadText = fileCmd.HasOption("dt");
        data.downloadHTML = !fileCmd.HasOption("ddh");
        data.gallery = fileCmd.HasOption("g");
        if (data.downloadText) {
            string[] textOptions = fileCmd.GetOptionValues("dt");
            data.textTag = textOptions[0];
            //The regex argument of -dt is optional; fall back to an empty pattern when only the tag is supplied
            data.textPattern = textOptions.Length > 1 ? textOptions[1] : "";
            data.includeLinks = fileCmd.HasOption("il");
        }
        if (fileCmd.HasOption("l")) {
            saveFile = fileCmd.GetOptionValue("l");
            //Loads the chain
            if (fileCmd.HasOption("m")) {
                markovChain = (MarkovChain)objSaveUtils.LoadObject("markov_" + saveFile, typeof(MarkovChain));
            }
            //Loads the tries
            data.urlTrie = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
            data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
            data.mediaTrie = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
        } else {
            if (args.Length > 0) {
                saveFile = webStringUtils.UnFuck(args[0]);
            }
            //If not loading the chain from a file, create a new chain
            if (fileCmd.HasOption("m")) {
                markovChain = new MarkovChain(Convert.ToInt32(fileCmd.GetOptionValue("m", "3")));
            }
            //Attempts to automatically load tries saved under the derived file name
            try {
                data.urlTrie = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                data.mediaTrie = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
            } catch (Exception) {
                //Generates fresh tries if nothing could be loaded
                data.urlTrie = new WebTrie();
                data.assistTrie = new WebTrie();
                data.mediaTrie = new WebTrie();
            }
        }
        data.outputFolder = fileCmd.GetOptionValue("o", getAppFolder()) + "CrawlResults\\";
        //Database options
        CommandLine dbCmd = parser.Parse(options, args);
        if (dbCmd.HasOption("dbip")) {
            TagDBDriver.instantiateDB(dbCmd.GetOptionValue("dbip"));
            data.dataBaseImages = dbCmd.HasOption("dbi");
            data.dataBaseLinks = dbCmd.HasOption("dbl");
            data.dataBaseCheck = dbCmd.HasOption("dbc");
        }
        //Data processing options
        CommandLine dpCmd = parser.Parse(options, args);
        printMarkov = dpCmd.HasOption("mp");
        markovSentences = Convert.ToInt32(dpCmd.GetOptionValue("mp", "0"));
        if (helpCmd.HasOption("h") || args.Length == 0) {
            printHelp();
            return;
        }
    } catch (Exception exception) {
        Console.WriteLine("Invalid arguments or parameters. Use -h for help (" + exception + ")");
        return;
    }
    //Creates the regex used to lock the crawl to the target domain
    if (crawlLocked) {
        string regexURL = Regex.Replace(args[0], "https?://", "");
        data.crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";
    } else {
        data.crawlDomain = ".*";
    }
    try {
        Crawl(args[0], data);
    } catch (Exception e) {
        Console.WriteLine("Scan aborted: " + e);
    }
}
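// Standalone sketch (illustrative, not taken from the project): reproduces the
// domain-lock regex that Main builds when -ul is not passed, and checks it against
// two URLs. Any chain of subdomains in front of the start URL's host is accepted,
// as described in the -ph help text above.
using System;
using System.Text.RegularExpressions;

class CrawlDomainDemo {
    static void Main() {
        string startUrl = "http://test.com/page";
        //Same construction as in Main: strip the scheme, then allow any subdomains
        //before the remaining host/path prefix
        string regexURL = Regex.Replace(startUrl, "https?://", "");
        string crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";

        Console.WriteLine(Regex.IsMatch("https://www.test.com/page/page2", crawlDomain)); //True
        Console.WriteLine(Regex.IsMatch("https://other.com/page", crawlDomain));          //False
    }
}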
/// <summary>
/// Creates a download manager bound to the given crawl configuration, saving results under its output folder.
/// </summary>
/// <param name="data">Shared crawl configuration.</param>
public DownloadManager(CrawlStruct data) {
    DownloadManager.data = data;
    webStringUtils = new WebStringUtils(data.outputFolder);
    saveQueue = new SaveQueue();
}