Пример #1
0
        /// <summary>
        /// Recursively builds the HtmlDocument tree: downloads every unvisited
        /// inner link of the current node, attaches the fetched pages as
        /// children, then recurses into each child.
        /// </summary>
        /// <param name="tree">Node whose subtree is being populated.</param>
        /// <param name="hashedDocs">Links already visited; prevents cycles and duplicate downloads.</param>
        /// <param name="LinkPath">Link-selection path passed through to ScraperUtilities.getInnerLinks (semantics defined there).</param>
        /// <param name="rootURL">Base URL used to resolve/scope the inner links.</param>
        /// <returns>The same <paramref name="tree"/> instance, with its subtree populated.</returns>
        private static HtmlDocumentTree createHtmlDocTreeSubroutine(HtmlDocumentTree tree, List <string> hashedDocs, string LinkPath, string rootURL)
        {
            List <string>           innerLink     = ScraperUtilities.getInnerLinks(tree.Node, LinkPath, rootURL);
            List <HtmlDocumentTree> childrenToAdd = new List <HtmlDocumentTree>();

            foreach (string link in innerLink)
            {
                if (!hashedDocs.Contains(link))
                {
                    hashedDocs.Add(link);
                    // NOTE(review): blocking on the async download keeps this method's
                    // synchronous signature; an async rewrite would change the interface.
                    Task <string> task = ScraperUtilities.AsyncUrlToTask(link);
                    task.Wait();
                    var doc = new HtmlDocument();
                    doc.LoadHtml(task.Result);
                    childrenToAdd.Add(new HtmlDocumentTree(doc, link));
                }
            }
            foreach (HtmlDocumentTree child in childrenToAdd)
            {
                tree.ChildDocuments.Add(child);
            }
            // BUG FIX: the original returned from inside this loop on the first
            // iteration, so only the FIRST child was ever recursed into and the
            // method returned that child's subtree instead of the parent tree.
            // Recurse into every child, then return the populated parent.
            foreach (HtmlDocumentTree child in childrenToAdd)
            {
                createHtmlDocTreeSubroutine(child, hashedDocs, LinkPath, rootURL);
            }
            return(tree);
        }
Пример #2
0
        /// <summary>
        /// Extracts the given zip archive and loads the first entry whose name
        /// ends with <paramref name="fileType"/> into the database as corpus content.
        /// </summary>
        /// <param name="zipStream">In-memory zip archive to read.</param>
        /// <param name="fileType">File-name suffix to match (case-insensitive ordinal comparison).</param>
        /// <param name="downloadURL">Origin URL recorded alongside the stored content.</param>
        void extractAndLoadZipIntoDatabase(MemoryStream zipStream, string fileType, string downloadURL)
        {
            // FIX: ZipArchive is IDisposable and was never disposed; the entry
            // stream was also leaked whenever an exception fired before Dispose().
            using (ZipArchive archive = new ZipArchive(zipStream))
            {
                foreach (ZipArchiveEntry entry in archive.Entries)
                {
                    try
                    {
                        if (entry.FullName.EndsWith(fileType, StringComparison.OrdinalIgnoreCase))
                        {
                            byte[] byteArray;
                            using (Stream unzippedEntryStream = entry.Open()) // .Open returns a stream
                            {
                                byteArray = ReadFully(unzippedEntryStream);   // converts stream to byte array
                            }

                            string bookName = getTitle(byteArray);

                            DateTime sqlDate = DateTime.Now;
                            ScraperUtilities.addCorpusContent(bookName, "text",
                                                              this.m_guid, this.GetType().FullName, sqlDate, downloadURL,
                                                              byteArray, m_context, m_corpusId);

                            // Only the first successfully-loaded matching entry is
                            // wanted; stop scanning instead of iterating the rest
                            // with a dead flag. (Reached only on success — a failed
                            // entry falls into the catch and the scan continues,
                            // matching the original lookingForFirstBook behavior.)
                            break;
                        }
                    }
                    catch
                    {
                        // Best-effort: skip invalid/corrupt entries rather than
                        // abort the whole archive.
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Builds the HTML document tree starting at rootURL (stored in the
        /// htmlDocumentTree field) and returns a newline-terminated listing of
        /// every entry in urlList.
        /// </summary>
        /// <returns>All entries of urlList, one per line (each followed by '\n').</returns>
        public string RunDisplay()
        {
            // Download the root page, blocking on the async fetch to keep this
            // method's synchronous signature.
            Task <string> task = ScraperUtilities.AsyncUrlToTask(rootURL);
            task.Wait();

            var rootDoc = new HtmlDocument();
            rootDoc.LoadHtml(task.Result);

            // Side effect: populates the htmlDocumentTree field used elsewhere.
            htmlDocumentTree = ScraperUtilities.createHtmlDocTree(rootDoc, rootURL, LinkPath, rootURL);

            // FIX(dead code): the original built a string from
            // displayHtmlDocumentTree(...) and then immediately overwrote it with
            // "". The call is kept in case it has side effects, but its return
            // value was never used — confirm and remove entirely.
            ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);

            // List all collected URLs, one per line.
            string result = "";
            foreach (var url in urlList)
            {
                result += url + "\n";
            }

            return(result);
        }
Пример #4
0
        /// <summary>
        /// Sets up conditions for the twitter sample stream and runs it:
        /// configures automatic rate-limit handling and an English language
        /// filter, attaches a per-tweet handler that stores each tweet as
        /// corpus content, and stops the stream when a time or download limit
        /// is reached (or if persisting a tweet fails).
        /// </summary>
        void StartTwitterStream()
        {
            // Enable automatic rate-limit handling.
            RateLimit.RateLimitTrackerMode = RateLimitTrackerMode.TrackAndAwait;
            var stream = Stream.CreateSampleStream();

            stream.StallWarnings = true;
            stream.AddTweetLanguageFilter(LanguageFilter.English);
            stream.FilterLevel = Tweetinvi.Streaming.Parameters.StreamFilterLevel.None;
            // FIX: removed unused locals `tweetList` and `auth` (the Authorized
            // property read appeared side-effect free; m_authorized is checked
            // directly below). Also removed commented-out code that contained a
            // real consumer key/secret pair — credentials must never live in
            // source, even commented out.

            m_downloadCount = 0;
            m_timer.Reset();
            m_timer.Start();

            // The handler must be attached before starting the stream; it runs
            // once for each tweet received.
            stream.TweetReceived += (sender, args) =>
            {
                ITweet tweet = args.Tweet;
                try
                {
                    Debug.WriteLine(tweet);
                    Console.WriteLine(tweet);
                    // NOTE(review): `title` is never used — confirm getTweetName
                    // has no required side effects, then remove this call.
                    string title = getTweetName(tweet);
                    ScraperUtilities.addCorpusContent(tweet.Text, "tweet", this.Guid,
                                                      this.GetType().FullName, tweet, this.m_context, m_corpusId);
                    m_downloadCount++;
                    m_progress = (float)m_downloadCount / m_downloadLimit;
                    if (timeStop() || downloadStop())
                    {
                        StopTwitterStream(stream);
                    }
                }
                catch
                {
                    // Any failure while persisting a tweet aborts the stream.
                    // NOTE(review): the exception is swallowed without logging —
                    // consider recording it before stopping.
                    StopTwitterStream(stream);
                }
            };

            // Start the stream now that the handler is wired up.
            if (m_authorized)
            {
                stream.StartStream();
            }
            else
            {
                m_status = "No Twitter Authorization";
            }
        }