private static void printHelp()
 {
     //helpFormatter.PrintHelp("java -jar [JARNAME] [URL]", helpHeader, options, helpFooter, true);
     Console.WriteLine("SKS Web crawler/info extractor v0.1");
     Console.Write("Required tags for their category are in "); CU.WCol("red", CU.r); Console.Write(" and recommended tags are in "); CU.WCol("yellow", CU.y);
     Console.Write(CU.nl + CU.nl + "Enter the URL followed by tags. "); CU.WCol("Even if you aren't doing a scan, you need a URL", CU.r);
     Console.WriteLine();
     Console.Write(CU.nl + "Help options:");
     Console.Write(CU.nl + "-h, --help                  display this help dialog");
     Console.Write(CU.nl + "-ph,--pattern help          help with regex patterns for search limiting");
     Console.WriteLine();
     Console.Write(CU.nl + "Crawl Options:");
     Console.Write(CU.nl + "-bc,--backcrawl             deep copy, enables discovery of hidden pages (slow)");
     CU.WCol(CU.nl + "-c,--courtesy <int>", CU.y); Console.Write("         delay between page loads, in milliseconds");
     CU.WCol(CU.nl + "-d,--depth <int>", CU.y); Console.Write("            depth of the search (default 10)");
     CU.WCol(CU.nl + "-p,--pattern <regex>", CU.y); Console.Write("        regex pattern for restricting pages");
     CU.WCol(CU.nl + "-t,--threads <int>", CU.y); Console.Write("          number of allowed threads. More threads = more aggressive (must be 2+)");
     Console.Write(CU.nl + "-ul,--unlock                unlocks crawler from target domain");
     Console.Write(CU.nl + "-i,--iterative <int>        scans urls iteratively in the form of url/1,2.. starting at <param>");
     Console.Write(CU.nl + "-id,--iterative depth <int> Depth to scan to at each step of the iteration");
     Console.Write(CU.nl + "-dc,--dont crawl            do not execute crawl. (for the purpose of using other utilities only)");
     Console.WriteLine();
     Console.Write(CU.nl + "File Options:");
     Console.Write(CU.nl + "-di,--images <regex>        download images while crawling (takes regex for filtering)");
     Console.Write(CU.nl + "-dt,--text <tag, regex>     download text bodies from <tag> for analyzation, if the page matches <regex>");
     Console.Write(CU.nl + "-g,--gallery                only download files to one folder");
     Console.Write(CU.nl + "-ddh,--HTML                 don't download HTML while crawling");
     Console.Write(CU.nl + "-il,--include link          include links to the parent page in text files");
     //CU.WCol(CU.nl + "-l,--load <filename>", CU.y); Console.Write("        load data from previous scan, named <filename> ");
     CU.WCol(CU.nl + "-o,--output <dir>", CU.y); Console.Write("           output location (defaults to exe location)");
     Console.Write(CU.nl + "-O,--overwrite              overwrite files when scan starts");
     Console.WriteLine();
     Console.Write(CU.nl + "Database Options:");
     CU.WCol(CU.nl + "-dbip,--database ip <ip>", CU.r); Console.Write("    the IP address of the database to dump to");
     Console.Write(CU.nl + "-dbc,--database check       Check the database to prevent duplicate entries "); CU.WCol("(Slow and expensive)", CU.y);
     Console.Write(CU.nl + "-dbi,--database images      Save image locations to database, with tags defined by -dt");
     Console.Write(CU.nl + "-dbl,--database links       Save visited links into the DB, with tags defined by -dt");
     Console.WriteLine();
     Console.Write(CU.nl + "Data processing Options:");
     Console.Write(CU.nl + "-m,--markov <int>           generate a markov chain of <int> prefix Length and saves it.");
     Console.Write(CU.nl + "-mp,--print markov <int>    prints out <int> sentences from the chain (Must use -g)");
     Console.WriteLine();
     Console.Write(CU.nl + "Output Options:");
     Console.Write(CU.nl + "-v,--verbose                verbose mode");
     Console.Write(CU.nl + "-vi,--visited               print visited pages a after completion (n.i.);");
     Console.WriteLine();
     Console.Write(CU.nl + "Example usages:");
     Console.Write(CU.nl + "Basic scan: java -jar [JARNAME] http://examplesite.com -di -d 5");
     Console.Write(CU.nl + "Site Image Gallery: [URL] -di -ddh -g");
     Console.Write(CU.nl + "Fullsize gallery of 4chan thread: [URL] -di ^((?!s.).)*$ -ddh -g -p .*/((?!#[spq]).)*");
     Console.Write(CU.nl + "Booru tags on posts with urls: [URL] -g -il -ddh -dt title (.*)(/post/show/)(.*) -O -c 1000 -d 3");
     Console.Write(CU.nl + "Iterative booru tag crawl: [BASEURL] -g -il -ddh -dt title -O -c 1000 -d 1000 -i <startpage>");
     Console.Write(CU.nl + "Markov chain from 4chan board: [URL] -t 10 -d 15 -dt .post .* -m 2 -g -ddh -O -mp 40");
     Console.Write(CU.nl + "Insert images into database with tags: [BOORUURL] -g -t 10 -di .*[/](_images/).* -ddh -d 10 -O -p .*[/]post/.* -ul -dt title -dbi)" + CU.nl);
 }
        public object LoadObject(string name, Type type)
        {
            object objOut   = null;
            string fileName = baseFolder + name + ".xml";

            DataContractSerializer serializer = new DataContractSerializer(type);

            using (XmlReader reader = XmlReader.Create(fileName)) {
                objOut = serializer.ReadObject(reader);
            }
            CU.WCol(fileName + " loaded successfully." + CU.nl, CU.g);
            return(objOut);
        }
        public void SaveObject(string name, object objIn, bool overWrite)
        {
            int    fileVersion = 1;
            string fileName    = baseFolder + name + ".xml";

            //Overwrite prevention- ex: file.xml, file_v2.xml, file_v3.xml...
            while (File.Exists(fileName) && !overWrite)
            {
                fileVersion++;
                fileName = baseFolder + name + "_v" + fileVersion + ".xml";
            }

            DataContractSerializer serializer = new DataContractSerializer(objIn.GetType());

            using (XmlWriter writer = XmlWriter.Create(fileName)) {
                serializer.WriteObject(writer, objIn);
            }

            CU.WCol(fileName + " saved successfully." + CU.nl, CU.g);
        }
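        //A minimal round-trip sketch of the two methods above. ScanSummary and SaveAndReloadExample
        //are illustrative assumptions, not part of the original crawler.
        [DataContract]
        public class ScanSummary
        {
            [DataMember] public string StartUrl   { get; set; }
            [DataMember] public int    PagesFound { get; set; }
        }

        public void SaveAndReloadExample()
        {
            ScanSummary summary = new ScanSummary { StartUrl = "http://examplesite.com", PagesFound = 42 };

            //Writes baseFolder + "summary.xml" (or summary_v2.xml, _v3... when overWrite is false)
            SaveObject("summary", summary, true);

            //Reads the same file back through an equivalent DataContractSerializer
            ScanSummary restored = (ScanSummary)LoadObject("summary", typeof(ScanSummary));
            Console.WriteLine(restored.StartUrl + " : " + restored.PagesFound);
        }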
 public int Run()
 {
     Console.WriteLine(threadName + " worker started!");
     try {
         //Executes iterative scan
         if (data.iterative)
         {
             for (int i = 0; i < data.maxDepth; i++)
             {
                 iteratorLocation++;
                 //Mod this into a more-than-2-thread operation later
                 if (sidedness)
                 {
                     while (iteratorLocation % 2 == 0)
                     {
                         iteratorLocation++;
                     }
                 }
                 else
                 {
                     while (iteratorLocation % 2 != 0)
                     {
                         iteratorLocation++;
                     }
                 }
                 pagesCrawled += Crawl(startURL + iteratorLocation, 0, sidedness);
             }
         }
         else
         {
             pagesCrawled = Crawl(workingURL, 0, sidedness);
         }
     } catch (Exception e) {
         CU.WCol(CU.nl + "something went terribly wrong, crawler died  " + e, CU.r, CU.y);
     }
     isFinished = true;
     Console.WriteLine(threadName + " finished");
     return(pagesCrawled);
 }
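 //Hedged illustration of the even/odd "sidedness" split used in Run(): with two workers, the
 //sidedness == true thread visits odd page indices and the other visits even ones.
 //PagesForWorker is an assumed helper for demonstration only (requires System.Collections.Generic);
 //it mirrors the while loops above without performing any crawling.
 public static IEnumerable<int> PagesForWorker(bool sidedness, int maxDepth)
 {
     int location = 0;
     for (int i = 0; i < maxDepth; i++)
     {
         location++;
         //sidedness == true skips even indices, false skips odd ones
         while (sidedness ? location % 2 == 0 : location % 2 != 0)
         {
             location++;
         }
         yield return location;
     }
 }
 //PagesForWorker(true, 5)  yields 1, 3, 5, 7, 9
 //PagesForWorker(false, 5) yields 2, 4, 6, 8, 10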
        //Saves all elements in argument
        public void DownloadElements(Element[] elements, FileInfo[] fileInfo = null)
        {
            int totalDownloaded = 0;
            int totalBounced    = 0;

            //Returns if there are no files to be downloaded
            if (fileInfo != null && fileInfo.Length == 0)
            {
                return;
            }

            for (int i = 0; i < elements.Length; i++)
            {
                Element content = elements[i];

                string absURL;

                string tag = content.Tag.ToString();

                switch (tag)
                {
                case "img":
                    absURL = content.AbsUrl("src");
                    break;

                case "a":
                    absURL = content.AbsUrl("href");
                    break;

                default:
                    absURL = content.AbsUrl("src");
                    break;
                }
                data.mediaTrie.InsertURL(absURL);
                FileInfo file;
                //Doesn't recalculate file info if it doesn't have to
                if (fileInfo == null)
                {
                    int nameIndex = absURL.LastIndexOf('/');
                    //Name of the element
                    string elementName = Regex.Replace(absURL.Substring(nameIndex + 1), "[^A-Za-z.]", "");
                    //File location of the element
                    string elementLocation = absURL.Substring(0, nameIndex);
                    if (elementName.Length > 20)
                    {
                        elementName = elementName.Substring(elementName.Length - 20);
                    }
                    //Inserts hash into filename to avoid duplicates
                    string hashCode = Convert.ToString(content.GetHashCode());
                    elementName = elementName.Insert(0, hashCode);
                    if (!data.gallery)
                    {
                        file = new FileInfo(webStringUtils.UrlToDir(elementLocation) + elementName);
                    }
                    else
                    {
                        file = new FileInfo(data.outputFolder + elementName);
                    }
                }
                else
                {
                    file = fileInfo[i];
                }


                //Defers downloading to the saver
                Save(absURL, file);
                //Sleeps to slow down image requests
                Thread.Sleep(data.delay);
                totalDownloaded++;
            }
            string report = "Downloaded " + totalDownloaded + " media files, denied " + totalBounced;

            CU.WCol(CU.nl + report + CU.nl, CU.c);
        }
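        //Hedged sketch of the gallery-mode filename derivation above: strip the path, keep only
        //letters and dots, truncate to the last 20 characters, and prefix a hash to avoid
        //duplicate names. GalleryFileName is an assumed helper for illustration only.
        public static string GalleryFileName(string absURL, int hash, string outputFolder)
        {
            int nameIndex = absURL.LastIndexOf('/');
            string elementName = Regex.Replace(absURL.Substring(nameIndex + 1), "[^A-Za-z.]", "");
            if (elementName.Length > 20)
            {
                elementName = elementName.Substring(elementName.Length - 20);
            }
            return outputFolder + hash + elementName;
        }
        //GalleryFileName("http://examplesite.com/img/photo_01.jpg", 12345, "out/") -> "out/12345photo.jpg"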
        //false = left, true = right
        //This should probably be split into two or three classes that inherit
        public int Crawl(string url, int currentDepth, bool sidedness)
        {
            //Helpers at their base level do not download content
            bool isHelper = false;
            //Iterative searchers spawn recursors to assist
            bool isIterativeRecursor = false;

            if (data.iterative && !url.Substring(0, url.LastIndexOf('/') + 1).Equals(data.startURL))
            {
                isIterativeRecursor = true;
            }

            int newPages = 0;

            //Early return if url is not in the domain or if it has been previously checked
            if (!url.Equals(startURL))
            {
                //Checks to see if the URL has already been searched or if it's not in the domain- if so, terminate.
                if (!Regex.IsMatch(url, data.crawlPattern) || !Regex.IsMatch(url, data.crawlDomain) || data.urlTrie.contains(url))
                {
                    return(newPages);
                }
                //Bounces via DB if flag is checked
                if (data.dataBaseCheck)
                {
                    Task <bool> isInDB = Task.Run(() => TagDBDriver.entryExists(url));
                    isInDB.Wait();
                    data.urlTrie.InsertURL(url);
                    if (isInDB.Result)
                    {
                        return(newPages);
                    }
                }
                newPages++;
            }
            else if (url.Equals(startURL))
            {
                if (data.urlTrie.contains(url))
                {
                    isHelper = true;
                    //isIterativeRecursor = true;
                }
            }

            //Iterative helpers still check the URLTrie

            if (isIterativeRecursor && (data.urlTrie.contains(url) || !Regex.IsMatch(url, data.crawlPattern)))
            {
                return(newPages);
            }

            data.urlTrie.InsertURL(url);


            //Courtesy delaying
            Thread.Sleep(data.delay);

            if (data.verbose)
            {
                Console.WriteLine("Starting crawl on " + url);
            }

            Document HTMLDoc       = null;
            int      errorIterator = 0;

            while (HTMLDoc == null)
            {
                try {
                    HTMLDoc = getHTML(url);
                } catch (Exception exception) {
                    //Helpers do not increase the iterator
                    if (data.iterative && !isIterativeRecursor)
                    {
                        if (exception.ToString().Contains("404"))
                        {
                            iteratorLocation++;
                            Console.WriteLine("404. Now on page " + iteratorLocation + ". Increasing index....");
                        }
                        else if (exception.ToString().Contains("503"))
                        {
                            iteratorLocation++;
                            Console.WriteLine("503. Now on page " + iteratorLocation + ". Increasing index....");
                        }
                        else
                        {
                            //Rethrows, preserving the original stack trace
                            throw;
                        }
                    }

                    if (exception.ToString().Contains("429"))
                    {
                        //Handling for rate limit exceptions: retry with a short wait a few times before backing off hard
                        errorIterator++;
                        if (errorIterator < 3)
                        {
                            Console.WriteLine(exception);
                            Console.WriteLine("Rate limited. waiting...");
                            Thread.Sleep(15000 + data.delay);
                            Console.WriteLine("Retrying...");
                        }
                        else
                        {
                            Console.WriteLine("Continued rate limiting. Thread waiting for one minute and increasing courtesy delay.");
                            Thread.Sleep(60000);
                            data.delay   += 10000;
                            errorIterator = 0;
                        }
                    }
                    else
                    {
                        CU.WCol(CU.nl + "Could not load page. " + url + " : " + exception.Message + CU.nl, CU.r);
                        return(newPages);
                    }
                }
            }
            //Grab links
            Elements links         = HTMLDoc.Select("a[href]");
            int      numberOfLinks = links.Count();
            //Grabs the page title, falling back to a placeholder when the page has none
            string titleString = HTMLDoc.Title ?? "Untitled";
            if (!data.verbose)
            {
                if (!isHelper)
                {
                    Console.WriteLine(threadName + " Crawling " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
                }
                else
                {
                    Console.WriteLine(threadName + " Finishing " + url.Truncate(40) + "(" + titleString.Truncate(40) + ")");
                }
            }
            if (data.verbose)
            {
                Console.WriteLine("Page name: " + titleString);
            }
            //Writes content to file
            try {
                //Refuse downloads if the crawler is a helper at this level
                if (!isHelper)
                {
                    //Prep information for DB entries
                    FileInfo[] files = null;
                    string[]   tags  = null;
                    //Download HTML
                    if (data.downloadHTML)
                    {
                        Thread downloadHTML = new Thread(() => downloadManager.downloadHTML(HTMLDoc));
                        downloadHTML.Start();
                    }
                    //Download text within specified tags (-dt [tag])
                    if (data.downloadText)
                    {
                        Elements        text         = HTMLDoc.Select(data.textTag);
                        Task <string[]> downloadText = new Task <string[]>(() => downloadManager.downloadText(text));
                        downloadText.Start();
                        tags = downloadText.Result;
                    }
                    //Download images and links to images
                    if (data.downloadImages)
                    {
                        //Checks for links to images
                        Elements imageElements = HTMLDoc.Select("img");
                        if (imageElements != null)
                        {
                            //Append links to images as well

                            foreach (Element element in links)
                            {
                                if (Regex.IsMatch(element.AbsUrl("href"), @".*\.(jpg|png|gif|webm)"))
                                {
                                    imageElements.Add(element);
                                }
                            }
                        }
                        Task <FileInfo[]> downloadImages = new Task <FileInfo[]>(() => downloadManager.DownloadElementsReturnNames(imageElements));
                        downloadImages.Start();
                        files = downloadImages.Result;
                    }
                    //Saves image locations to the database (skipped when no images were downloaded)
                    if (data.dataBaseImages && files != null)
                    {
                        foreach (FileInfo file in files)
                        {
                            new Thread(() => TagDBDriver.insertImageWithTags(file.FullName, tags)).Start();
                        }
                    }
                    //Saves links to Database
                    if (data.dataBaseLinks)
                    {
                        new Thread(() => TagDBDriver.insertImageWithTags(url, tags)).Start();
                    }
                }
            } catch (Exception e) {
                Console.WriteLine("Could not write to file: " + e);
            }

            //Checks if the search needs to recurse
            if (numberOfLinks <= 0)
            {
                if (data.verbose)
                {
                    Console.WriteLine("No links on page. Going back up...");
                }
                return(newPages);
            }
            //if the crawl is iterative, do not recurse
            try {
                //Recurses the algorithm if not at max depth
                if (currentDepth + 1 > data.maxDepth && !data.iterative)
                {
                    return(newPages);
                }
                //Do shallow recursion while in iterative mode
                else if (currentDepth + 1 >= data.iterativeDepth && isIterativeRecursor)
                {
                    return(newPages);
                }

                if (numberOfLinks > data.linkAssist && !data.assistTrie.contains(url) && !data.iterative)
                {
                    data.assistTrie.InsertURL(url);
                    this.workingURL   = url;
                    this.workingDepth = currentDepth;
                    createHelper();
                }

                //Right-handed search
                //can these be one method?
                int sizeLimit = (int)Math.Round(numberOfLinks / 2f);
                if (sidedness)
                {
                    for (int i = numberOfLinks - 1; i > 0; i--)
                    {
                        //Only search half at the entry depth
                        if (currentDepth == startDepth && i < sizeLimit)
                        {
                            break;
                        }
                        string currentLinkRight = links[i].AbsUrl("href");
                        //Skips empty links, links back to the current page (including same-page anchors),
                        //links to other pages positioned along the iterative crawl, and links carrying a rel attribute
                        if (string.IsNullOrEmpty(currentLinkRight) || currentLinkRight.Equals(url) ||
                            (data.iterative && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                            (currentLinkRight.Contains('#') && currentLinkRight.Substring(0, currentLinkRight.LastIndexOf('#')).Equals(url)) ||
                            links[i].HasAttr("rel"))
                        {
                            continue;
                        }
                        //Validates the URL format- if a request can't even be formed, move on to the next link
                        try {
                            WebRequest.Create(currentLinkRight);
                        } catch (Exception) {
                            continue;
                        }
                        newPages += Crawl(currentLinkRight, currentDepth + 1, sidedness);
                    }
                } //Left-handed search
                else
                {
                    for (int i = 0; i < numberOfLinks - 1; i++)
                    {
                        if (currentDepth == startDepth && i > sizeLimit)
                        {
                            break;
                        }
                        string currentLinkLeft = links[i].AbsUrl("href");
                        //Same skip conditions as the right-handed search above
                        if (string.IsNullOrEmpty(currentLinkLeft) || currentLinkLeft.Equals(url) ||
                            (data.iterative && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('/') + 1).Equals(data.startURL)) ||
                            (currentLinkLeft.Contains('#') && currentLinkLeft.Substring(0, currentLinkLeft.LastIndexOf('#')).Equals(url)) ||
                            links[i].HasAttr("rel"))
                        {
                            continue;
                        }
                        //Validates the URL format- if a request can't even be formed, move on to the next link
                        try {
                            WebRequest.Create(currentLinkLeft);
                        } catch (Exception) {
                            continue;
                        }
                        newPages += Crawl(currentLinkLeft, currentDepth + 1, sidedness);
                    }
                }


                //Backcrawl to hit missed directories at this level
                if (data.backCrawl)
                {
                    while (url.Substring(8).Contains("/"))
                    {
                        Console.WriteLine("Backcrawling unfound urls...");
                        //Steps up one directory per pass (the offset of 8 skips the protocol prefix, e.g. "https://")
                        Crawl(url = url.Substring(0, url.LastIndexOf('/')), currentDepth - 1, sidedness);
                    }
                    }
                }
            } catch (Exception e) {
                CU.WCol(CU.nl + "Dead page: " + e, CU.r, CU.y);
                CU.WCol(CU.nl + e.StackTrace, CU.r, CU.y);
                Console.WriteLine("Now checking depth " + currentDepth + ", link: " + url);
            }
            return(newPages);
        }