static void Main(string[] args)
{
    //Handles early exits
    AppDomain.CurrentDomain.ProcessExit += new EventHandler(OnProcessExit);
    //Loads embedded DLLs into the EXE if they don't load automatically
    AppDomain.CurrentDomain.AssemblyResolve += (sender, arguments) =>
    {
        string resourceName = "AssemblyLoadingAndReflection." + new AssemblyName(arguments.Name).Name + ".dll";
        using (var stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName))
        {
            byte[] assemblyData = new byte[stream.Length];
            stream.Read(assemblyData, 0, assemblyData.Length);
            return Assembly.Load(assemblyData);
        }
    };
    //Application initialization
    webStringUtils = new WebStringUtils(getAppFolder());
    objSaveUtils = new ObjSaveUtils(getAppFolder() + '/');
    //Help options
    Options options = new Options();
    options.AddOption(new Option("h", "help", false, "display this help dialog"));
    options.AddOption(new Option("ph", "pattern help", false, "help with the -p command"));
    options.AddOption(new Option("v", "verbose", false, "verbose mode"));
    //Crawl options
    options.AddOption(new Option("dc", "dont crawl", false, "do not execute crawl (for the purpose of using other utilities only)"));
    options.AddOption(new Option("vi", "visited", false, "print visited pages after completion (n.i.)"));
    options.AddOption(new Option("ul", "unlock", false, "unlocks crawler from target domain"));
    options.AddOption(new Option("bc", "backcrawl", false, "deep copy, enables discovery of hidden pages (slow)"));
    options.AddOption(new Option("p", "pattern", true, "regex pattern for restricting pages"));
    options.AddOption(new Option("d", "depth", true, "depth of the search (default 5)"));
    options.AddOption(new Option("c", "courtesy", true, "delay between page loads, in milliseconds"));
    options.AddOption(new Option("t", "threads", true, "number of allowed threads. More threads = more aggressive (must be 2+)"));
    options.AddOption(new Option("i", "iterative", true, "scans urls iteratively in the form of url/1,2.. starting at <param>"));
    options.AddOption(new Option("id", "iterative depth", true, "depth to scan to at each step of the iteration"));
    //File options
    options.AddOption(new Option("O", "overwrite", false, "overwrite files when scan starts"));
    Option downloadImages = new Option("di", "images", true, "download images while crawling (takes regex for filtering)");
    downloadImages.OptionalArg = true;
    downloadImages.NumberOfArgs = 1;
    options.AddOption(downloadImages);
    Option downloadText = new Option("dt", "text", false, "download text bodies for analysis <tag, regex>");
    downloadText.OptionalArg = true;
    downloadText.NumberOfArgs = 2;
    downloadText.ValueSeparator = ' ';
    options.AddOption(downloadText);
    options.AddOption(new Option("il", "include link", false, "include links to the parent page in text files"));
    options.AddOption(new Option("g", "gallery", false, "only download files to one folder"));
    options.AddOption(new Option("o", "output", true, "output location (defaults to exe location)"));
    options.AddOption(new Option("l", "load", true, "load data from previous scan, named <param>"));
    //Database options
    options.AddOption(new Option("dbl", "database links", false, "save visited links into the DB, with tags defined by -dt"));
    options.AddOption(new Option("dbi", "database images", false, "save image locations to the DB, with tags defined by -dt"));
    options.AddOption(new Option("ddh", "HTML", false, "don't download HTML while crawling"));
    options.AddOption(new Option("dbc", "database check", false, "check the database to prevent duplicate entries (slow)"));
    options.AddOption(new Option("dbip", "database ip", true, "the IP address of the database to dump to"));
    //Data processing options
    options.AddOption(new Option("m", "markov", true, "generate a markov chain of <param> prefix length and save it"));
    options.AddOption(new Option("mp", "print markov", true, "prints out [param] sentences from the chain (must use -g)"));
    //Attempts to parse args
    try
    {
        ICommandLineParser parser = new PosixParser();
        //Help options
        CommandLine helpCmd = parser.Parse(options, args);
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.Width = 100;
        helpFormatter.DescPadding = 0x1;
        string helpHeader = "\nSKS Web crawler/info extractor v0.1";
        string helpFooter = "\nExample Usage: java -jar [JARNAME] http://pornhub.com -di -d 5"
            + "\nSite Image Gallery: [URL] -di -ddh -g"
            + "\nFullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*"
            + "\nSankaku tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3"
            + "\nIterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>"
            + "\nMarkov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40"
            + "\nInsert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi";
        if (helpCmd.HasOption("ph"))
        {
            Console.WriteLine("\n-p and -i take a regular exp. as an argument, searching all URLs"
                + "\nthat match the pattern. E.g., \"test.com/page\" would"
                + "\nmatch \"test.com/page/page2\". To test for any subdomain,"
                + "\nthe following pattern would operate on [anything].test.com:"
                + "\nhttps?://([^/.]+[.])*test.com(.*)");
            return;
        }
        data.verbose = helpCmd.HasOption("v");
        //Crawl options
        CommandLine crawlCmd = parser.Parse(options, args);
        if (args.Length > 0)
        {
            data.startURL = args[0];
        }
        data.backCrawl = crawlCmd.HasOption("bc");
        data.iterative = crawlCmd.HasOption("i");
        shouldCrawl = !crawlCmd.HasOption("dc");
        data.iteratorStart = Convert.ToInt32(crawlCmd.GetOptionValue("i", "0"));
        data.iterativeDepth = Convert.ToInt32(crawlCmd.GetOptionValue("id", "0"));
        data.crawlPattern = crawlCmd.GetOptionValue("p", ".*");
        data.maxDepth = Convert.ToInt32(crawlCmd.GetOptionValue("d", "5"));
        data.delay = Convert.ToInt32(crawlCmd.GetOptionValue("c", "0"));
        crawlThreadExecutor = new LimitedConcurrencyLevelTaskScheduler(Convert.ToInt32(crawlCmd.GetOptionValue("t", "2")));
        crawlThreadFactory = new TaskFactory(crawlThreadExecutor);
        crawlLocked = !crawlCmd.HasOption("ul");
        //File options
        CommandLine fileCmd = parser.Parse(options, args);
        data.overwrite = fileCmd.HasOption("O");
        data.downloadImages = fileCmd.HasOption("di");
        data.imagePattern = fileCmd.GetOptionValue("di", "");
        data.downloadText = fileCmd.HasOption("dt");
        data.downloadHTML = !fileCmd.HasOption("ddh");
        data.gallery = fileCmd.HasOption("g");
        if (data.downloadText)
        {
            //-dt takes <tag, regex>; the regex is optional
            string[] textOptions = fileCmd.GetOptionValues("dt");
            data.textTag = textOptions[0];
            try
            {
                data.textPattern = textOptions[1];
            }
            catch (Exception)
            {
                data.textPattern = "";
            }
            data.includeLinks = fileCmd.HasOption("il");
        }
        if (fileCmd.HasOption("l"))
        {
            saveFile = fileCmd.GetOptionValue("l");
            //Loads the chain
            if (fileCmd.HasOption("m"))
            {
                markovChain = (MarkovChain)objSaveUtils.LoadObject("markov_" + saveFile, typeof(MarkovChain));
            }
            //Loads the tries
            data.urlTrie = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
            data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
            data.mediaTrie = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
        }
        else
        {
            if (args.Length > 0)
            {
                saveFile = webStringUtils.UnFuck(args[0]);
            }
            //If not loading the chain from file, create a new chain
            if (fileCmd.HasOption("m"))
            {
                markovChain = new MarkovChain(Convert.ToInt32(fileCmd.GetOptionValue("m", "3")));
            }
            //Attempts to automatically load saved data by file name
            try
            {
                data.urlTrie = (WebTrie)objSaveUtils.LoadObject("visitedTrie_" + saveFile, typeof(WebTrie));
                data.assistTrie = (WebTrie)objSaveUtils.LoadObject("assistTrie_" + saveFile, typeof(WebTrie));
                data.mediaTrie = (WebTrie)objSaveUtils.LoadObject("mediaTrie_" + saveFile, typeof(WebTrie));
            }
            catch (Exception)
            {
                //Generate fresh tries if they could not be loaded
                data.urlTrie = new WebTrie();
                data.assistTrie = new WebTrie();
                data.mediaTrie = new WebTrie();
            }
        }
        data.outputFolder = fileCmd.GetOptionValue("o", getAppFolder()) + "CrawlResults\\";
        //Database options
        CommandLine dbCmd = parser.Parse(options, args);
        if (dbCmd.HasOption("dbip"))
        {
            TagDBDriver.instantiateDB(dbCmd.GetOptionValue("dbip"));
            data.dataBaseImages = dbCmd.HasOption("dbi");
            data.dataBaseLinks = dbCmd.HasOption("dbl");
            data.dataBaseCheck = dbCmd.HasOption("dbc");
        }
        //Data processing options
        CommandLine dpCmd = parser.Parse(options, args);
        printMarkov = dpCmd.HasOption("mp");
        markovSentences = Convert.ToInt32(dpCmd.GetOptionValue("mp", "0"));
        if (helpCmd.HasOption("h") || args.Length == 0)
        {
            printHelp();
            return;
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("Invalid arguments or parameters. Use -h for help (" + exception + ")");
        return;
    }
    //Creates the regex for site locking
    if (crawlLocked)
    {
        string regexURL = Regex.Replace(args[0], "https?://", "");
        data.crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";
    }
    else
    {
        data.crawlDomain = ".*";
    }
    try
    {
        Crawl(args[0], data);
    }
    catch (Exception e)
    {
        Console.WriteLine("Scan aborted: " + e);
    }
}
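//Illustrative sketch (not part of the original source): shows how the site-lock pattern built
//above behaves. The seed URL "http://test.com" and the method name DemoCrawlDomainPattern are
//hypothetical; the pattern construction mirrors the crawlLocked branch in Main. Note that dots
//in the seed are not regex-escaped, so the lock is slightly looser than a literal domain match.
static void DemoCrawlDomainPattern()
{
    string seed = "http://test.com"; //hypothetical seed URL
    string regexURL = Regex.Replace(seed, "https?://", "");
    string crawlDomain = "https?://([^/.]+[.])*" + regexURL + "(.*)";
    Console.WriteLine(Regex.IsMatch("https://img.test.com/page/2", crawlDomain)); //True  (subdomain allowed)
    Console.WriteLine(Regex.IsMatch("http://test.com/gallery", crawlDomain));     //True  (same domain)
    Console.WriteLine(Regex.IsMatch("https://other.org/page", crawlDomain));      //False (outside the lock)
}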
//false = left, true = right
//This should probably be split into two or three classes that inherit
public int Crawl(string url, int currentDepth, bool sidedness)
{
    //Helpers at their base level do not download content
    bool isHelper = false;
    //Iterative searches spawn recursors to assist
    bool isIterativeRecursor = false;
    if (data.iterative && !url.Substring(0, url.LastIndexOf('/') + 1).Equals(data.startURL))
    {
        isIterativeRecursor = true;
    }
    int newPages = 0;
    //Early return if the url is not in the domain or has been previously checked
    if (!url.Equals(startURL))
    {
        //Checks whether the URL has already been searched or is outside the domain; if so, terminate
        if (!Regex.IsMatch(url, data.crawlPattern) || !Regex.IsMatch(url, data.crawlDomain) || data.urlTrie.contains(url))
        {
            return newPages;
        }
        //Bounces via the DB if the flag is set
        if (data.dataBaseCheck)
        {
            Task<bool> isInDB = Task.Run(() => TagDBDriver.entryExists(url));
            isInDB.Wait();
            data.urlTrie.InsertURL(url);
            if (isInDB.Result)
            {
                return newPages;
            }
        }
        newPages++;
    }
    else if (url.Equals(startURL))
    {
        if (data.urlTrie.contains(url))
        {
            isHelper = true;
        }
    }
    //Iterative helpers still check the URL trie
    if (isIterativeRecursor && (data.urlTrie.contains(url) || !Regex.IsMatch(url, data.crawlPattern)))
    {
        return newPages;
    }
    data.urlTrie.InsertURL(url);
    //Courtesy delay
    Thread.Sleep(data.delay);
    if (data.verbose)
    {
        Console.WriteLine("Starting crawl on " + url);
    }
    Document HTMLDoc = null;
    int errorIterator = 0;
    while (HTMLDoc == null)
    {
        try
        {
            HTMLDoc = getHTML(url);
        }
        catch (Exception exception)
        {
            //Helpers do not increase the iterator
            if (data.iterative && !isIterativeRecursor)
            {
                if (exception.ToString().Contains("404"))
                {
                    iteratorLocation++;
                    Console.WriteLine("404. Now on page " + iteratorLocation + ". Increasing index....");
                }
                else if (exception.ToString().Contains("503"))
                {
                    iteratorLocation++;
                    Console.WriteLine("503. Now on page " + iteratorLocation + ". Increasing index....");
                }
                else
                {
                    throw;
                }
            }
            if (exception.ToString().Contains("429"))
            {
                //Handling for rate-limit exceptions
                errorIterator++;
                if (errorIterator <= 1)
                {
                    Console.WriteLine(exception);
                    Console.WriteLine("Rate limited. Waiting...");
                    Thread.Sleep(15000 + data.delay);
                    Console.WriteLine("Retrying...");
                }
                else
                {
                    Console.WriteLine("Continued rate limiting. Thread waiting for one minute and increasing courtesy delay.");
                    Thread.Sleep(60000);
                    data.delay += 10000;
                    errorIterator = 0;
                }
            }
            else
            {
                CU.WCol(CU.nl + "Could not load page. " + url + " : " + exception.Message + CU.nl, CU.r);
                return newPages;
            }
        }
    }
    //Grab links
    Elements links = HTMLDoc.Select("a[href]");
    int numberOfLinks = links.Count();
    //Grabs the page title
    string titleString = HTMLDoc.Title ?? "Untitled";
    if (!data.verbose)
    {
        if (!isHelper)
        {
            Console.WriteLine(threadName + " Crawling " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
        }
        else
        {
            Console.WriteLine(threadName + " Finishing " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
        }
    }
    else
    {
        Console.WriteLine("Page name: " + titleString);
    }
    //Writes content to file
    try
    {
        //Refuse downloads if the crawler is a helper at this level
        if (!isHelper)
        {
            //Prep information for DB entries
            FileInfo[] files = null;
            string[] tags = null;
            //Download HTML
            if (data.downloadHTML)
            {
                Thread downloadHTML = new Thread(() => downloadManager.downloadHTML(HTMLDoc));
                downloadHTML.Start();
            }
            //Download text within specified tags (-dt [tag])
            if (data.downloadText)
            {
                Elements text = HTMLDoc.Select(data.textTag);
                Task<string[]> downloadText = new Task<string[]>(() => downloadManager.downloadText(text));
                downloadText.Start();
                tags = downloadText.Result;
            }
            //Download images and links to images
            if (data.downloadImages)
            {
                //Checks for links to images
                Elements imageElements = HTMLDoc.Select("img");
                if (imageElements != null)
                {
                    //Append links to images as well
                    foreach (Element element in links)
                    {
                        if (Regex.IsMatch(element.AbsUrl("href"), ".*(.jpg|.png|.gif|.webm)"))
                        {
                            imageElements.Add(element);
                        }
                    }
                }
                Task<FileInfo[]> downloadImages = new Task<FileInfo[]>(() => downloadManager.DownloadElementsReturnNames(imageElements));
                downloadImages.Start();
                files = downloadImages.Result;
            }
            //Saves image locations to the database
            if (data.dataBaseImages)
            {
                foreach (FileInfo file in files)
                {
                    new Thread(() => TagDBDriver.insertImageWithTags(file.FullName, tags)).Start();
                }
            }
            //Saves links to the database
            if (data.dataBaseLinks)
            {
                new Thread(() => TagDBDriver.insertImageWithTags(url, tags)).Start();
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Could not write to file: " + e);
    }
    //Checks whether the search needs to recurse
    if (numberOfLinks <= 0)
    {
        if (data.verbose)
        {
            Console.WriteLine("No links on page. Going back up...");
        }
        return newPages;
    }
    //If the crawl is iterative, do not recurse beyond the iterative depth
    try
    {
        //Recurses the algorithm only if not at max depth
        if (currentDepth + 1 > data.maxDepth && !data.iterative)
        {
            return newPages;
        }
        //Do shallow recursion while in iterative mode
        else if (currentDepth + 1 >= data.iterativeDepth && isIterativeRecursor)
        {
            return newPages;
        }
        if (numberOfLinks > data.linkAssist && !data.assistTrie.contains(url) && !data.iterative)
        {
            data.assistTrie.InsertURL(url);
            this.workingURL = url;
            this.workingDepth = currentDepth;
            createHelper();
        }
        //Could the right- and left-handed searches be one method?
        int sizeLimit = (int)Math.Round(numberOfLinks / 2f);
        //Right-handed search
        if (sidedness)
        {
            for (int i = numberOfLinks - 1; i > 0; i--)
            {
                //Only search half of the links at the entry depth
                if (currentDepth == startDepth && i < sizeLimit)
                {
                    break;
                }
                string currentLinkRight = links[i].AbsUrl("href");
                //Skips empty links, in-page references, links back to the current page, rel-tagged links,
                //and links to other pages positioned along the iterative crawl
                if (string.IsNullOrEmpty(currentLinkRight)
                    || currentLinkRight.Equals(url)
                    || (data.iterative && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('/') + 1).Equals(data.startURL))
                    || (currentLinkRight.Contains('#') && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('#')).Equals(url))
                    || links[i].HasAttr("rel"))
                {
                    continue;
                }
                //Ensures the link can be connected to; if not, move on to the next link
                try
                {
                    WebRequest.Create(currentLinkRight);
                }
                catch (Exception)
                {
                    continue;
                }
                newPages += Crawl(currentLinkRight, currentDepth + 1, sidedness);
            }
        }
        //Left-handed search
        else
        {
            for (int i = 0; i < numberOfLinks - 1; i++)
            {
                if (currentDepth == startDepth && i > sizeLimit)
                {
                    break;
                }
                string currentLinkLeft = links[i].AbsUrl("href");
                if (string.IsNullOrEmpty(currentLinkLeft)
                    || currentLinkLeft.Equals(url)
                    || (data.iterative && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('/') + 1).Equals(data.startURL))
                    || (currentLinkLeft.Contains('#') && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('#')).Equals(url))
                    || links[i].HasAttr("rel"))
                {
                    continue;
                }
                try
                {
                    WebRequest.Create(currentLinkLeft);
                }
                catch (Exception)
                {
                    continue;
                }
                newPages += Crawl(currentLinkLeft, currentDepth + 1, sidedness);
            }
        }
        //Backcrawl to hit missed directories at this level
        if (data.backCrawl)
        {
            while (url.Substring(8).Contains("/"))
            {
                Console.WriteLine("Backcrawling unfound urls...");
                Crawl(url = url.Substring(0, url.LastIndexOf('/')), currentDepth - 1, sidedness);
            }
        }
    }
    catch (Exception e)
    {
        CU.WCol(CU.nl + "Dead page: " + e, CU.r, CU.y);
        CU.WCol(CU.nl + e.StackTrace, CU.r, CU.y);
        Console.WriteLine("Now checking depth " + currentDepth + ", link: " + url);
    }
    return newPages;
}
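//Illustrative sketch (not from the original project): a minimal stand-in for the WebTrie used
//above, showing only the members Crawl relies on (contains / InsertURL). The name WebTrieSketch
//is hypothetical; the real WebTrie presumably stores URLs in a prefix trie, whereas a HashSet is
//used here purely to illustrate the contract, not the author's data structure.
public class WebTrieSketch
{
    private readonly System.Collections.Generic.HashSet<string> visited =
        new System.Collections.Generic.HashSet<string>();

    //Records a URL as visited
    public void InsertURL(string url)
    {
        visited.Add(url);
    }

    //Returns true if the URL was previously inserted
    public bool contains(string url)
    {
        return visited.Contains(url);
    }
}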