/// <summary>
/// Recursively builds the HtmlDocument tree
/// </summary>
/// <param name="tree">Node whose child documents should be discovered and populated</param>
/// <param name="hashedDocs">Links that have already been visited, used to avoid revisiting pages</param>
/// <param name="LinkPath"></param>
/// <param name="rootURL">Root URL of the site being scraped</param>
private static HtmlDocumentTree createHtmlDocTreeSubroutine(HtmlDocumentTree tree, List<string> hashedDocs, string LinkPath, string rootURL)
{
    List<string> innerLinks = ScraperUtilities.getInnerLinks(tree.Node, LinkPath, rootURL);
    List<HtmlDocumentTree> childrenToAdd = new List<HtmlDocumentTree>();

    // Download and parse every inner link that has not been visited yet
    foreach (string link in innerLinks)
    {
        if (!hashedDocs.Contains(link))
        {
            hashedDocs.Add(link);
            Task<string> task = ScraperUtilities.AsyncUrlToTask(link);
            task.Wait();
            var doc = new HtmlDocument();
            doc.LoadHtml(task.Result);
            childrenToAdd.Add(new HtmlDocumentTree(doc, link));
        }
    }

    foreach (HtmlDocumentTree child in childrenToAdd)
    {
        tree.ChildDocuments.Add(child);
    }

    // Recurse into every child so the whole subtree under this node gets built
    foreach (HtmlDocumentTree child in childrenToAdd)
    {
        createHtmlDocTreeSubroutine(child, hashedDocs, LinkPath, rootURL);
    }

    return tree;
}
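// The subroutine above assumes an HtmlDocumentTree node type that wraps a parsed
// HtmlAgilityPack.HtmlDocument, remembers the URL it came from, and holds a list of
// child pages. The sketch below is a guess at that shape, not the project's actual
// class; the real type may carry more state (hashes, depth, etc.).
// Requires: using System.Collections.Generic; using HtmlAgilityPack;
public class HtmlDocumentTree
{
    public HtmlDocument Node { get; }                      // parsed page
    public string Url { get; }                             // where the page was fetched from
    public List<HtmlDocumentTree> ChildDocuments { get; }  // pages linked from this one

    public HtmlDocumentTree(HtmlDocument node, string url)
    {
        Node = node;
        Url = url;
        ChildDocuments = new List<HtmlDocumentTree>();
    }
}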
/// <summary>
/// Extracts the given zip file and loads its contents into the database
/// </summary>
/// <param name="zipStream">In-memory zip archive that was downloaded</param>
/// <param name="fileType">File extension to look for inside the archive (e.g. ".txt")</param>
/// <param name="downloadURL">URL the archive was downloaded from</param>
void extractAndLoadZipIntoDatabase(MemoryStream zipStream, string fileType, string downloadURL)
{
    ZipArchive archive = new ZipArchive(zipStream);
    bool lookingForFirstBook = true;

    // Store the first entry that matches the requested file type in the corpus
    foreach (ZipArchiveEntry entry in archive.Entries)
    {
        try
        {
            if (entry.FullName.EndsWith(fileType, StringComparison.OrdinalIgnoreCase) && lookingForFirstBook)
            {
                // .Open returns a stream over the unzipped entry
                using (Stream unzippedEntryStream = entry.Open())
                {
                    byte[] byteArray = ReadFully(unzippedEntryStream); // converts the stream to a byte array
                    string bookName = getTitle(byteArray);
                    DateTime sqlDate = DateTime.Now;
                    ScraperUtilities.addCorpusContent(bookName, "text", this.m_guid, this.GetType().FullName, sqlDate, downloadURL, byteArray, m_context, m_corpusId);
                }
                lookingForFirstBook = false;
            }
        }
        catch
        {
            // Ignore entries that cannot be read or parsed
        }
    }
}
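// ReadFully is called above but not defined in this section. A minimal sketch,
// assuming it simply buffers the whole entry stream into memory; the project's
// version may differ (chunked reads, size limits, etc.).
// Requires: using System.IO;
static byte[] ReadFully(Stream input)
{
    using (var buffer = new MemoryStream())
    {
        input.CopyTo(buffer); // copy the unzipped entry into an in-memory buffer
        return buffer.ToArray();
    }
}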
/// <summary>
/// Builds the scraped document tree and returns a printable summary;
/// factored out to clean up the GET method in the controller
/// </summary>
/// <returns>The rendered document tree followed by the list of scraped URLs</returns>
public string RunDisplay()
{
    // Download and parse the root page
    Task<string> task = ScraperUtilities.AsyncUrlToTask(rootURL);
    task.Wait();
    var testDoc = new HtmlDocument();
    testDoc.LoadHtml(task.Result);

    // Crawl from the root page and build the document tree
    htmlDocumentTree = ScraperUtilities.createHtmlDocTree(testDoc, rootURL, LinkPath, rootURL);

    string result = "";
    result += ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);

    // List every URL collected during the crawl
    foreach (var x in urlList)
    {
        result += x + "\n";
    }
    return result;
}
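// ScraperUtilities.AsyncUrlToTask is used here and in the tree builder to fetch a
// page's HTML, but its implementation is not shown. A plausible minimal version,
// assuming it just wraps HttpClient.GetStringAsync; the real helper may add retries,
// headers, or encoding handling. (ScraperUtilitiesSketch is a hypothetical name used
// only for this illustration.)
// Requires: using System.Net.Http; using System.Threading.Tasks;
static class ScraperUtilitiesSketch
{
    // One shared HttpClient avoids socket exhaustion across repeated requests
    private static readonly HttpClient s_client = new HttpClient();

    public static Task<string> AsyncUrlToTask(string url)
    {
        // GetStringAsync completes with the response body as a string
        return s_client.GetStringAsync(url);
    }
}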
/// <summary>
/// Sets up conditions for the Twitter sample stream and runs the stream
/// </summary>
void StartTwitterStream()
{
    // Enable automatic rate-limit handling
    RateLimit.RateLimitTrackerMode = RateLimitTrackerMode.TrackAndAwait;

    var stream = Stream.CreateSampleStream();
    stream.StallWarnings = true;
    stream.AddTweetLanguageFilter(LanguageFilter.English);
    stream.FilterLevel = Tweetinvi.Streaming.Parameters.StreamFilterLevel.None;

    m_downloadCount = 0;
    m_timer.Reset();
    m_timer.Start();

    // The handler has to be attached before the stream is started;
    // it tells us what to do each time we receive a tweet.
    stream.TweetReceived += (sender, args) =>
    {
        ITweet tweet = args.Tweet;
        try
        {
            Debug.WriteLine(tweet);
            Console.WriteLine(tweet);
            string title = getTweetName(tweet);
            ScraperUtilities.addCorpusContent(tweet.Text, "tweet", this.Guid, this.GetType().FullName, tweet, this.m_context, m_corpusId);
            m_downloadCount++;
            m_progress = (float)m_downloadCount / m_downloadLimit;

            // Stop once the time or download limit is reached
            if (timeStop() || downloadStop())
            {
                StopTwitterStream(stream);
            }
        }
        catch
        {
            StopTwitterStream(stream);
        }
    };

    // Start the stream, now that we know what to do with it
    if (m_authorized)
    {
        stream.StartStream();
    }
    else
    {
        m_status = "No Twitter Authorization";
        // Re-authorizing here would need the consumer key/secret, which belong in
        // configuration rather than in source:
        // UserAuthentication(consumerKey, consumerSecret);
        // StartTwitterStream();
    }
}
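// downloadStop() and timeStop() decide when the stream shuts itself down; they are
// referenced above but not shown. A sketch under assumed names: m_timer behaves like
// a System.Diagnostics.Stopwatch, m_downloadCount and m_downloadLimit appear above,
// and m_timeLimitSeconds is a hypothetical field for the time budget.
private bool downloadStop()
{
    // Stop once we have stored as many tweets as the configured limit
    return m_downloadCount >= m_downloadLimit;
}

private bool timeStop()
{
    // Stop once the stream has run longer than the configured time budget
    return m_timer.Elapsed >= TimeSpan.FromSeconds(m_timeLimitSeconds);
}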