Example #1
        WebPageInfo GetWebPageInfo(Uri url, bool loadContent = true)
        {
            WebPageInfo result = null;

            using (var response = GetWebResponse(url, loadContent))
            {
                // Notify subscribers of every response, including failed (null) ones.
                if (OnWebResponseEvent != null)
                {
                    OnWebResponseEvent(this, new WebResponseEventArgs()
                    {
                        Url      = url,
                        Response = response
                    });
                }
                if (response != null && response.ContentLength > 0)
                {
                    using (var sr = new StreamReader(response.GetResponseStream()))
                    {
                        // The using statements dispose both the reader and the response,
                        // so explicit Close() calls are unnecessary here.
                        result = new WebPageInfo(url, Data.UrlType, sr.ReadToEnd(), response.ContentType);
                    }
                }
            }
            return(result);
        }
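A minimal usage sketch for this method, assuming it were exposed on the crawler class; the event payload fields (Url, Response) follow the code above, while the crawler variable is illustrative:

        // Observe every response the crawler receives, then fetch a single page.
        crawler.OnWebResponseEvent += (sender, e) =>
        {
            Console.WriteLine("Fetched " + e.Url + " (" + (e.Response != null ? "ok" : "failed") + ")");
        };
        WebPageInfo page = crawler.GetWebPageInfo(new Uri("https://www.example.com/"));
        if (page != null)
        {
            Console.WriteLine("Content type: " + page.ContentType);
        }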
Example #2
 /// <summary>
 /// Add a vertex (node) to the graph.
 /// </summary>
 /// <param name="wpi">WebPageInfo object</param>
 /// <returns></returns>
 public bool AddVertex(WebPageInfo v)
 {
     if (this.m_GraphData != null && v != null)
     {
         return(this.m_GraphData.AddVertex(v));
     }
     return(false);
 }
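A short illustration of the guard behaviour, with a hypothetical graph instance and page object:

 var added    = graph.AddVertex(homePage); // true when m_GraphData and the vertex are both non-null
 var rejected = graph.AddVertex(null);     // false: null vertices are rejected without throwing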
Example #3
 /// <summary>
 /// Create a vertex/edge pair: a discovered page plus the edge that led to it.
 /// </summary>
 public VertexEdgeGroup(WebPageInfo _v, IEdge <WebPageInfo> _e)
     : this(_v)
 {
     Edge = _e;
 }
Example #4
 /// <summary>
 /// Create a group holding only a vertex, e.g. the start page, which has no incoming edge.
 /// </summary>
 public VertexEdgeGroup(WebPageInfo _v)
 {
     Vertex = _v;
 }
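A brief sketch of how the two constructors pair up during a crawl; the edge construction mirrors the one in Example #5, and the vertex variables are illustrative:

 // The start page has no incoming edge, so only the vertex is recorded.
 var origin = new VertexEdgeGroup(startPage);
 // Each discovered neighbour is stored together with the edge that found it.
 var edge       = new Edge <WebPageInfo>(startPage, neighborPage);
 var discovered = new VertexEdgeGroup(neighborPage, edge);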
Example #5
        /// <summary>
        /// Crawl a website. Give it an initial starting point (its center) and let it do its magic.
        /// This implementation is a depth-first traversal, meaning it follows each chain of links
        /// all the way down before visiting a page's remaining neighbours.
        /// (Data Structures, page 768.)
        /// Input comes from the Data settings object:
        /// Data.StartUrl       - URL (link/website) to start from.
        /// Data.UrlType        - crawl only absolute URIs, only relative URIs, or both.
        /// Data.OnlySameDomain - if false, the crawler will try to crawl the entire web... or crash the computer.
        /// </summary>
        /// <returns>The crawled link structure as a Graph, or null when the start page cannot be loaded.</returns>
        public Graph Crawl()
        {
            String startDomain = Data.StartUrl.Host; // e.g. www.example.com
            Queue <VertexEdgeGroup> traversalOrder = new Queue <VertexEdgeGroup>();
            Stack <WebPageInfo>     vertexStack    = new Stack <WebPageInfo>();
            WebPageInfo             originVertex   = GetWebPageInfo(Data.StartUrl);

            if (originVertex == null)
            {
                return(null);
            }
            traversalOrder.Enqueue(new VertexEdgeGroup(originVertex));
            vertexStack.Push(originVertex);
            Stack <Uri>   tempUrlList;
            List <string> processedUrls = new List <string>();

            // Fall back to HTML, plain text and XML when no allowed content types were configured.
            if (Data.AllowedContentTypes == null)
            {
                Data.AllowedContentTypes = new String[] { MediaTypeNames.Text.Html, MediaTypeNames.Text.Plain, MediaTypeNames.Text.Xml };
            }

            while (vertexStack.Count > 0)
            {
                WebPageInfo topVertex = vertexStack.Pop();
                tempUrlList = new Stack <Uri>(topVertex.OutgoingLinks);
                if (Data.IsDebugMode)
                {
                    Console.WriteLine("Stack size: " + vertexStack.Count);
                    Console.WriteLine("topVertex URL: " + topVertex.Url);
                }
                while (tempUrlList.Count > 0) // has an unvisited neighbour
                {
                    // Pop up front so every early exit below can simply continue with
                    // the next URL (the original jumped to a shared Pop() via goto).
                    Uri url = tempUrlList.Pop();

                    var     currentHost  = url.Host;
                    Boolean isSameDomain = currentHost.Equals(startDomain);

                    // Only follow links when the current page is not a local file;
                    // skipping here also avoids dereferencing a null neighbour below.
                    if (topVertex.IsLocalFile)
                    {
                        continue;
                    }
                    WebPageInfo nextNeighbor = GetWebPageInfo(url);
                    if (nextNeighbor == null)
                    {
                        continue;
                    }

                    // The response content type may carry a charset suffix (e.g. "text/html; charset=utf-8"),
                    // so test whether it starts with an allowed type rather than the reverse.
                    if (!Data.AllowedContentTypes.Any(w => nextNeighbor.ContentType.StartsWith(w)))
                    {
                        continue;
                    }

                    // Strip the fragment so "page#top" and "page#bottom" count as the same URL.
                    var urlKey         = url.GetLeftPart(UriPartial.Query);
                    var isUrlProcessed = processedUrls.Contains(urlKey);
                    if (!isUrlProcessed) // next neighbour not visited before
                    {
                        // Make sure that we only process URLs from the same domain when Data.OnlySameDomain = true
                        if (Data.OnlySameDomain && !isSameDomain)
                        {
                            continue;
                        }
                        if (!HostsProcessed.Contains(currentHost))
                        {
                            HostsProcessed.Add(currentHost);
                        }
                        if (HostsProcessed.Count >= Data.MaxDepth)
                        {
                            continue;
                        }
                        if (Data.IsDebugMode)
                        {
                            Console.WriteLine(vertexStack.Count + ", " + tempUrlList.Count + ", " + topVertex.Url + " > " + url);
                        }
                        var edge = new Edge <WebPageInfo>(topVertex, nextNeighbor);
                        if (OnAddEdgeEvent != null)
                        {
                            OnAddEdgeEvent(this, new EdgeEventArgs()
                            {
                                Edge = edge
                            });
                        }
                        processedUrls.Add(urlKey);
                        traversalOrder.Enqueue(new VertexEdgeGroup(nextNeighbor, edge));
                        vertexStack.Push(nextNeighbor);
                    }
                }
            }
            //Create and return the graph
            Graph newGraph = new Graph(Data.StartUrl);

            while (traversalOrder.Count > 0)
            {
                VertexEdgeGroup     group = traversalOrder.Dequeue();
                WebPageInfo         v     = group.Vertex;
                IEdge <WebPageInfo> e     = group.Edge;
                newGraph.AddVertex(v);
                if (e != null)
                {
                    newGraph.AddEdge(e);
                }
            }
            return(newGraph);
        }
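A hedged end-to-end sketch of driving Crawl(): the WebCrawler class name is assumed (the examples never name the enclosing type), and the Source/Target edge properties are assumed from the IEdge<WebPageInfo> interface; the Data settings and the event match the code above:

        var crawler = new WebCrawler();                        // hypothetical enclosing type
        crawler.Data.StartUrl       = new Uri("https://www.example.com/");
        crawler.Data.OnlySameDomain = true;                    // stay on the start domain
        crawler.Data.MaxDepth       = 3;                       // stop after three distinct hosts
        crawler.OnAddEdgeEvent += (sender, e) =>
            Console.WriteLine(e.Edge.Source.Url + " -> " + e.Edge.Target.Url);

        Graph graph = crawler.Crawl();                         // null if the start page failed to load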