private static void printHelp()
{
    //helpFormatter.PrintHelp("java -jar [JARNAME] [URL]", helpHeader, options, helpFooter, true);
    Console.WriteLine("SKS Web crawler/info extractor v0.1");
    Console.Write("Required tags for their category are in ");
    CU.WCol("red", CU.r);
    Console.Write(" and recommended tags are in ");
    CU.WCol("yellow", CU.y);
    Console.Write(CU.nl + CU.nl + "Enter the URL followed by tags. ");
    CU.WCol("Even if you aren't doing a scan, you need a URL", CU.r);
    Console.WriteLine();
    Console.Write(CU.nl + "Help options:");
    Console.Write(CU.nl + "-h, --help                   display this help dialog");
    Console.Write(CU.nl + "-ph,--pattern help           help with regex patterns for search limiting");
    Console.WriteLine();
    Console.Write(CU.nl + "Crawl Options:");
    Console.Write(CU.nl + "-bc,--backcrawl              deep copy, enables discovery of hidden pages (slow)");
    CU.WCol(CU.nl + "-c,--courtesy <int>", CU.y);
    Console.Write("          delay between page loads, in milliseconds");
    CU.WCol(CU.nl + "-d,--depth <int>", CU.y);
    Console.Write("             depth of the search (default 10)");
    CU.WCol(CU.nl + "-p,--pattern <regex>", CU.y);
    Console.Write("         regex pattern for restricting pages");
    CU.WCol(CU.nl + "-t,--threads <int>", CU.y);
    Console.Write("           number of allowed threads. More threads = more aggressive (must be 2+)");
    Console.Write(CU.nl + "-ul,--unlock                 unlocks crawler from target domain");
    Console.Write(CU.nl + "-i,--iterative <int>         scans urls iteratively in the form of url/1,2.. starting at <param>");
    Console.Write(CU.nl + "-id,--iterative depth <int>  depth to scan to at each step of the iteration");
    Console.Write(CU.nl + "-dc,--dont crawl             do not execute crawl (for the purpose of using other utilities only)");
    Console.WriteLine();
    Console.Write(CU.nl + "File Options:");
    Console.Write(CU.nl + "-di,--images <regex>         download images while crawling (takes regex for filtering)");
    Console.Write(CU.nl + "-dt,--text <tag, regex>      download text bodies from <tag> for analysis, if the page matches <regex>");
    Console.Write(CU.nl + "-g,--gallery                 only download files to one folder");
    Console.Write(CU.nl + "-ddh,--HTML                  don't download HTML while crawling");
    Console.Write(CU.nl + "-il,--include link           include links to the parent page in text files");
    //CU.WCol(CU.nl + "-l,--load <filename>", CU.y); Console.Write(" load data from previous scan, named <filename> ");
    CU.WCol(CU.nl + "-o,--output <dir>", CU.y);
    Console.Write("            output location (defaults to exe location)");
    Console.Write(CU.nl + "-O,--overwrite               overwrite files when scan starts");
    Console.WriteLine();
    Console.Write(CU.nl + "Database Options:");
    CU.WCol(CU.nl + "-dbip,--database ip <ip>", CU.r);
    Console.Write("     the IP address of the database to dump to");
    Console.Write(CU.nl + "-dbc,--database check        check the database to prevent duplicate entries ");
    CU.WCol("(Slow and expensive)", CU.y);
    Console.Write(CU.nl + "-dbi,--database images       save image locations to database, with tags defined by -dt");
    Console.Write(CU.nl + "-dbl,--database links        save visited links into the DB, with tags defined by -dt");
    Console.WriteLine();
    Console.Write(CU.nl + "Data processing Options:");
    Console.Write(CU.nl + "-m,--markov <int>            generate a markov chain of <int> prefix length and save it");
    Console.Write(CU.nl + "-mp,--print markov <int>     print out <int> sentences from the chain (must use -g)");
    Console.WriteLine();
    Console.Write(CU.nl + "Output Options:");
    Console.Write(CU.nl + "-v,--verbose                 verbose mode");
    Console.Write(CU.nl + "-vi,--visited                print visited pages after completion (n.i.)");
    Console.WriteLine();
    Console.Write(CU.nl + "Example usages:");
    Console.Write(CU.nl + "Basic scan: java -jar [JARNAME] http://examplesite.com -di -d 5");
    Console.Write(CU.nl + "Site Image Gallery: [URL] -di -ddh -g");
    Console.Write(CU.nl + "Fullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*");
    Console.Write(CU.nl + "Booru tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3");
    Console.Write(CU.nl + "Iterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>");
    Console.Write(CU.nl + "Markov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40");
    Console.Write(CU.nl + "Insert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi" + CU.nl);
}
public object LoadObject(string name, Type type)
{
    object objOut;
    string fileName = baseFolder + name + ".xml";
    DataContractSerializer serializer = new DataContractSerializer(type);
    using (XmlReader reader = XmlReader.Create(fileName))
    {
        objOut = serializer.ReadObject(reader);
    }
    CU.WCol(fileName + " loaded successfully." + CU.nl, CU.g);
    return objOut;
}
public void SaveObject(string name, object objIn, bool overWrite)
{
    int fileVersion = 1;
    string fileName = baseFolder + name + ".xml";
    //Overwrite prevention- ex: file, file_v2, file_v3, file_v4...
    while (File.Exists(fileName) && !overWrite)
    {
        fileVersion++;
        fileName = baseFolder + name + "_v" + fileVersion + ".xml";
    }
    DataContractSerializer serializer = new DataContractSerializer(objIn.GetType());
    using (XmlWriter writer = XmlWriter.Create(fileName))
    {
        serializer.WriteObject(writer, objIn);
    }
    CU.WCol(fileName + " saved successfully." + CU.nl, CU.g);
}
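// Usage sketch: round-tripping an object through SaveObject/LoadObject. The CrawlerSettings
// type and the fileUtils instance below are hypothetical, used only for illustration; any
// [DataContract] type works the same way (requires System.Runtime.Serialization).
[DataContract]
public class CrawlerSettings
{
    [DataMember] public int MaxDepth { get; set; }
    [DataMember] public string StartUrl { get; set; }
}

//Save without overwriting (a versioned file is written if "settings.xml" already exists),
//then load it back by name and cast to the concrete type:
//  fileUtils.SaveObject("settings", new CrawlerSettings { MaxDepth = 10, StartUrl = "http://examplesite.com" }, false);
//  CrawlerSettings restored = (CrawlerSettings)fileUtils.LoadObject("settings", typeof(CrawlerSettings));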
public int Run()
{
    Console.WriteLine(threadName + " worker started!");
    try
    {
        //Executes iterative scan
        if (data.iterative)
        {
            for (int i = 0; i < data.maxDepth; i++)
            {
                iteratorLocation++;
                //Mod this into a more-than-2-thread operation later
                if (sidedness)
                {
                    while (iteratorLocation % 2 == 0) { iteratorLocation++; }
                }
                else
                {
                    while (iteratorLocation % 2 != 0) { iteratorLocation++; }
                }
                pagesCrawled += Crawl(startURL + iteratorLocation, 0, sidedness);
            }
        }
        else
        {
            pagesCrawled = Crawl(workingURL, 0, sidedness);
        }
    }
    catch (Exception e)
    {
        CU.WCol(CU.nl + "something went terribly wrong, crawler died " + e, CU.r, CU.y);
    }
    isFinished = true;
    Console.WriteLine(threadName + " finished");
    return pagesCrawled;
}
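// Run() splits an iterative crawl between two workers by parity: the worker with
// sidedness == true only ever visits odd url/<index> pages, the other only even ones,
// so the two never request the same page. A standalone sketch of that partitioning
// (names illustrative, not part of the original source):
static int NextIndexForSide(int iteratorLocation, bool sidedness)
{
    iteratorLocation++;
    if (sidedness)
    {
        while (iteratorLocation % 2 == 0) { iteratorLocation++; }  //right side: odd pages
    }
    else
    {
        while (iteratorLocation % 2 != 0) { iteratorLocation++; }  //left side: even pages
    }
    return iteratorLocation;
}

//e.g. starting from 0: NextIndexForSide(0, true) == 1, NextIndexForSide(1, true) == 3,
//while NextIndexForSide(0, false) == 2 and NextIndexForSide(2, false) == 4.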
//Saves all elements in argument
public void DownloadElements(Element[] elements, FileInfo[] fileInfo = null)
{
    int totalDownloaded = 0;
    int totalBounced = 0;
    //Returns if there are no files to be downloaded
    if (fileInfo != null && fileInfo.Length == 0)
    {
        return;
    }
    for (int i = 0; i < elements.Length; i++)
    {
        Element content = elements[i];
        string absURL;
        string tag = content.Tag.ToString();
        switch (tag)
        {
            case "img":
                absURL = content.AbsUrl("src");
                break;
            case "a":
                absURL = content.AbsUrl("href");
                break;
            default:
                absURL = content.AbsUrl("src");
                break;
        }
        data.mediaTrie.InsertURL(absURL);
        FileInfo file;
        //Doesn't recalculate file info if it doesn't have to
        if (fileInfo == null)
        {
            int nameIndex = absURL.LastIndexOf('/');
            //Name of the element
            string elementName = Regex.Replace(absURL.Substring(nameIndex + 1), "[^A-Za-z.]", "");
            //File location of the element
            string elementLocation = absURL.Substring(0, nameIndex);
            if (elementName.Length > 20)
            {
                elementName = elementName.Substring(elementName.Length - 20);
            }
            //Inserts hash into filename to avoid duplicates
            string hashCode = Convert.ToString(content.GetHashCode());
            elementName = elementName.Insert(0, hashCode);
            if (!data.gallery)
            {
                file = new FileInfo(webStringUtils.UrlToDir(elementLocation) + elementName);
            }
            else
            {
                file = new FileInfo(data.outputFolder + elementName);
            }
        }
        else
        {
            file = fileInfo[i];
        }
        //Defers downloading to the saver
        Save(absURL, file);
        //Sleeps to slow down image requests
        Thread.Sleep(data.delay);
        totalDownloaded++;
    }
    string report = "Downloaded " + totalDownloaded + " media files, denied " + totalBounced;
    CU.WCol(CU.nl + report + CU.nl, CU.c);
}
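// How DownloadElements names a file: take the last path segment of the absolute URL, strip
// everything but letters and dots, keep at most the trailing 20 characters, and prefix the
// element's hash code to avoid duplicate names. A standalone sketch of just that naming step
// (illustrative only; the real method also routes the file into a per-site directory via
// webStringUtils.UrlToDir unless the -g/gallery flag is set):
static string DeriveElementName(string absUrl, int hashCode)
{
    int nameIndex = absUrl.LastIndexOf('/');
    string name = Regex.Replace(absUrl.Substring(nameIndex + 1), "[^A-Za-z.]", "");
    if (name.Length > 20)
    {
        name = name.Substring(name.Length - 20);
    }
    return Convert.ToString(hashCode) + name;
}

//e.g. DeriveElementName("http://example.com/images/photo_001.jpg?large=1", 1234)
//returns "1234photo.jpglarge" (digits, underscores, and query characters stripped).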
//false = left, true = right
//This should probably be split into two or three classes that inherit
public int Crawl(string url, int currentDepth, bool sidedness)
{
    //Helpers at their base level do not download content
    bool isHelper = false;
    //Iterative searchers spawn recursors to assist
    bool isIterativeRecursor = false;
    if (data.iterative && !url.Substring(0, url.LastIndexOf('/') + 1).Equals(data.startURL))
    {
        isIterativeRecursor = true;
    }
    int newPages = 0;
    //Early return if url is not in the domain or if it has been previously checked
    if (!url.Equals(startURL))
    {
        //Checks to see if the URL has already been searched or if it's not in the domain- if so, terminate.
        if (!Regex.IsMatch(url, data.crawlPattern) || !Regex.IsMatch(url, data.crawlDomain) || data.urlTrie.contains(url))
        {
            return newPages;
        }
        //Bounces via DB if flag is checked
        if (data.dataBaseCheck)
        {
            Task<bool> isInDB = Task.Run(() => TagDBDriver.entryExists(url));
            isInDB.Wait();
            data.urlTrie.InsertURL(url);
            if (isInDB.Result)
            {
                return newPages;
            }
        }
        newPages++;
    }
    else if (url.Equals(startURL))
    {
        if (data.urlTrie.contains(url))
        {
            isHelper = true;
            //isIterativeRecursor = true;
        }
    }
    //Iterative helpers still check the URLTrie
    if (isIterativeRecursor && (data.urlTrie.contains(url) || !Regex.IsMatch(url, data.crawlPattern)))
    {
        return newPages;
    }
    data.urlTrie.InsertURL(url);
    //Courtesy delaying
    Thread.Sleep(data.delay);
    if (data.verbose)
    {
        Console.WriteLine("Starting crawl on " + url);
    }
    Document HTMLDoc = null;
    int errorIterator = 0;
    while (HTMLDoc == null)
    {
        try
        {
            HTMLDoc = getHTML(url);
        }
        catch (Exception exception)
        {
            //Helpers do not increase the iterator
            if (data.iterative && !isIterativeRecursor)
            {
                if (exception.ToString().Contains("404"))
                {
                    iteratorLocation++;
                    Console.WriteLine("404. Now on page " + iteratorLocation + ". Increasing index....");
                }
                else if (exception.ToString().Contains("503"))
                {
                    iteratorLocation++;
                    Console.WriteLine("503. Now on page " + iteratorLocation + ". Increasing index....");
                }
                else
                {
                    throw;
                }
            }
            if (exception.ToString().Contains("429"))
            {
                //Handling for rate limit exceptions
                errorIterator++;
                if (errorIterator < 2)
                {
                    Console.WriteLine(exception);
                    Console.WriteLine("Rate limited. Waiting...");
                    Thread.Sleep(15000 + data.delay);
                    Console.WriteLine("Retrying...");
                }
                else
                {
                    Console.WriteLine("Continued rate limiting. Thread waiting for one minute and increasing courtesy delay.");
                    Thread.Sleep(60000);
                    data.delay += 10000;
                    errorIterator = 0;
                }
            }
            else
            {
                CU.WCol(CU.nl + "Could not load page. " + url + " : " + exception.Message + CU.nl, CU.r);
                return newPages;
            }
        }
    }
    //Grab links
    Elements links = HTMLDoc.Select("a[href]");
    int numberOfLinks = links.Count();
    //Grabs the page title, falling back to a placeholder
    string titleString = HTMLDoc.Title ?? "Untitled";
    if (!data.verbose)
    {
        if (!isHelper)
        {
            Console.WriteLine(threadName + " Crawling " + url.Truncate(40) + " (" + titleString.Truncate(40) + ")");
        }
        else
        {
            Console.WriteLine(threadName + " Finishing " + url.Truncate(40) + " (" + titleString.Truncate(40) + ")");
        }
    }
    else
    {
        Console.WriteLine("Page name: " + titleString);
    }
    //Writes content to file
    try
    {
        //Refuse files if crawler is a helper at this level
        if (!isHelper)
        {
            //Prep information for DB entries
            FileInfo[] files = null;
            string[] tags = null;
            //Download HTML
            if (data.downloadHTML)
            {
                Thread downloadHTML = new Thread(() => downloadManager.downloadHTML(HTMLDoc));
                downloadHTML.Start();
            }
            //Download text within specified tags (-dt [tag])
            if (data.downloadText)
            {
                Elements text = HTMLDoc.Select(data.textTag);
                Task<string[]> downloadText = new Task<string[]>(() => downloadManager.downloadText(text));
                downloadText.Start();
                tags = downloadText.Result;
            }
            //Download images and links to images
            if (data.downloadImages)
            {
                //Checks for links to images
                Elements imageElements = HTMLDoc.Select("img");
                if (imageElements != null)
                {
                    //Append links to images as well
                    foreach (Element element in links)
                    {
                        if (Regex.IsMatch(element.AbsUrl("href"), @".*\.(jpg|png|gif|webm)"))
                        {
                            imageElements.Add(element);
                        }
                    }
                }
                Task<FileInfo[]> downloadImages = new Task<FileInfo[]>(() => downloadManager.DownloadElementsReturnNames(imageElements));
                downloadImages.Start();
                files = downloadImages.Result;
            }
            //Saves image locations to Database (files is only populated when images are downloaded)
            if (data.dataBaseImages && files != null)
            {
                foreach (FileInfo file in files)
                {
                    new Thread(() => TagDBDriver.insertImageWithTags(file.FullName, tags)).Start();
                }
            }
            //Saves links to Database
            if (data.dataBaseLinks)
            {
                new Thread(() => TagDBDriver.insertImageWithTags(url, tags)).Start();
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Could not write to file: " + e);
    }
    //Checks if the search needs to recurse
    if (numberOfLinks <= 0)
    {
        if (data.verbose)
        {
            Console.WriteLine("No links on page. Going back up...");
        }
        return newPages;
    }
    //If the crawl is iterative, do not recurse
    try
    {
        //Recurses the algorithm if not at max depth
        if (currentDepth + 1 > data.maxDepth && !data.iterative)
        {
            return newPages;
        }
        //Do shallow recursion while in iterative mode
        else if (currentDepth + 1 >= data.iterativeDepth && isIterativeRecursor)
        {
            return newPages;
        }
        if (numberOfLinks > data.linkAssist && !data.assistTrie.contains(url) && !data.iterative)
        {
            data.assistTrie.InsertURL(url);
            this.workingURL = url;
            this.workingDepth = currentDepth;
            createHelper();
        }
        //Right-handed search
        //can these be one method?
        int sizeLimit = (int)Math.Round(numberOfLinks / 2f);
        if (sidedness)
        {
            for (int i = numberOfLinks - 1; i > 0; i--)
            {
                //Only search half at the entry depth
                if (currentDepth == startDepth && i < sizeLimit)
                {
                    break;
                }
                string currentLinkRight = links[i].AbsUrl("href");
                //Checks to make sure that the URL isn't a page in-reference and that it doesn't link to another part of the page. Also ensures link validity.
                //Also ignore links to other pages positioned along the iterative crawl
                if (string.IsNullOrEmpty(currentLinkRight) || currentLinkRight.Equals(url) ||
                    (data.iterative && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                    (currentLinkRight.Contains('#') && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('#')).Equals(url)) ||
                    links[i].HasAttr("rel"))
                {
                    continue;
                }
                //Ensures the link can be connected to- if not, move on to the next link
                try
                {
                    WebRequest.Create(currentLinkRight);
                }
                catch (Exception)
                {
                    continue;
                }
                newPages += Crawl(currentLinkRight, currentDepth + 1, sidedness);
            }
        }
        //Left-handed search
        else
        {
            for (int i = 0; i < numberOfLinks - 1; i++)
            {
                //Only search half at the entry depth
                if (currentDepth == startDepth && i > sizeLimit)
                {
                    break;
                }
                string currentLinkLeft = links[i].AbsUrl("href");
                if (string.IsNullOrEmpty(currentLinkLeft) || currentLinkLeft.Equals(url) ||
                    (data.iterative && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                    (currentLinkLeft.Contains('#') && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('#')).Equals(url)) ||
                    links[i].HasAttr("rel"))
                {
                    continue;
                }
                try
                {
                    WebRequest.Create(currentLinkLeft);
                }
                catch (Exception)
                {
                    continue;
                }
                newPages += Crawl(currentLinkLeft, currentDepth + 1, sidedness);
            }
        }
        //Backcrawl to hit missed directories at this level
        if (data.backCrawl)
        {
            while (url.Substring(8).Contains("/"))
            {
                Console.WriteLine("Backcrawling unfound urls...");
                Crawl(url = url.Substring(0, url.LastIndexOf('/')), currentDepth - 1, sidedness);
            }
        }
    }
    catch (Exception e)
    {
        CU.WCol(CU.nl + "Dead page: " + e, CU.r, CU.y);
        CU.WCol(CU.nl + e.StackTrace, CU.r, CU.y);
        Console.WriteLine("Now checking depth " + currentDepth + ", link: " + url);
    }
    return newPages;
}
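// The two directional loops in Crawl apply the same skip conditions, and the in-code note
// "can these be one method?" suggests factoring them out. A sketch of what that shared
// predicate might look like, mirroring the existing checks (illustrative only, not part of
// the original source; assumes the same Element API and data fields used above):
private bool ShouldSkipLink(Element link, string linkUrl, string pageUrl)
{
    //Empty link or link back to the current page
    if (string.IsNullOrEmpty(linkUrl) || linkUrl.Equals(pageUrl))
    {
        return true;
    }
    //Link to another page positioned along the iterative crawl
    if (data.iterative && linkUrl.Substring(0, linkUrl.LastIndexOf('/') + 1).Equals(data.startURL))
    {
        return true;
    }
    //Same-page anchor
    if (linkUrl.Contains('#') && linkUrl.Substring(0, linkUrl.LastIndexOf('#')).Equals(pageUrl))
    {
        return true;
    }
    //Links carrying a rel attribute are skipped, as in the loops above
    return link.HasAttr("rel");
}

//Each loop body could then reduce to:
//  if (ShouldSkipLink(links[i], links[i].AbsUrl("href"), url)) { continue; }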