/// <summary>
/// Loads a site crawl configuration from an XML document and maps it onto a new SiteEntity.
/// </summary>
/// <remarks>
/// Content read, relative to the document element:
///   Name, RootUrl,
///   CrawlUrlsConfig/MaxDepth, CrawlUrlsConfig/StartPoint (repeatable),
///   CrawlUrlsConfig/NoFollow/RegEx (repeatable),
///   PageExtractionElements/Element (repeatable; "Name" and "RegEx" attributes),
///   Element/Field (repeatable; "Name" and "RegEx" attributes).
/// Child nodes that are not elements (XML comments, text, processing instructions) are ignored.
/// A missing element or attribute from the list above is not validated here and surfaces as a
/// NullReferenceException; a non-numeric MaxDepth surfaces as the exception thrown by int.Parse.
/// </remarks>
/// <param name="xmlPath">Location of the XML document; passed unchanged to XmlDocument.Load.</param>
/// <returns>The SiteEntity populated from the document.</returns>
public static SiteEntity ReadSiteEntityXml(string xmlPath)
        {
            SiteEntity siteEntity = new SiteEntity();

            XmlDocument doc = new XmlDocument();
            doc.Load(xmlPath);

            XmlElement root = doc.DocumentElement;
            siteEntity.Name = root["Name"].InnerText;
            siteEntity.RootUrl = root["RootUrl"].InnerText;

            XmlElement crawlConfig = root["CrawlUrlsConfig"];

            // Config files are machine-readable, so parse independently of the thread culture.
            siteEntity.MaxDepth = int.Parse(
                crawlConfig["MaxDepth"].InnerText,
                System.Globalization.CultureInfo.InvariantCulture);

            // All loops below iterate as XmlNode and filter with "is XmlElement":
            // ChildNodes also yields comments/text nodes, and a foreach typed as
            // XmlElement would throw InvalidCastException on the first such node.
            foreach (XmlNode node in crawlConfig.ChildNodes)
            {
                if (node is XmlElement && node.Name.Equals("StartPoint"))
                {
                    siteEntity.StartPointUrls.Add(node.InnerText);
                }
            }

            XmlElement noFollow = crawlConfig["NoFollow"];

            foreach (XmlNode node in noFollow.ChildNodes)
            {
                if (node is XmlElement && node.Name.Equals("RegEx"))
                {
                    siteEntity.NoFollowExpressions.Add(node.InnerText);
                }
            }

            XmlElement extractionRoot = root["PageExtractionElements"];

            foreach (XmlNode elementNode in extractionRoot.ChildNodes)
            {
                if (!(elementNode is XmlElement) || !elementNode.Name.Equals("Element"))
                {
                    continue;
                }

                ExtractionElement extractionElement = new ExtractionElement();
                extractionElement.Name = elementNode.Attributes["Name"].Value;
                extractionElement.RegEx = elementNode.Attributes["RegEx"].Value;

                foreach (XmlNode fieldNode in elementNode.ChildNodes)
                {
                    if (!(fieldNode is XmlElement) || !fieldNode.Name.Equals("Field"))
                    {
                        continue;
                    }

                    ExtractionElement fieldElement = new ExtractionElement();
                    fieldElement.Name = fieldNode.Attributes["Name"].Value;
                    fieldElement.RegEx = fieldNode.Attributes["RegEx"].Value;

                    // Fields are keyed by their Name attribute.
                    extractionElement.Fields.Add(fieldElement.Name, fieldElement);
                }

                siteEntity.ExtractionElements.Add(extractionElement);
            }

            return siteEntity;
        }
Beispiel #2
0
        /// <summary>
        /// Prepares the crawler for a site: stores the parameters, creates and initializes the
        /// MySQL-backed link and element storages, and seeds the link storage with the initial URLs.
        /// </summary>
        /// <remarks>
        /// When the site defines start-point URLs, each one not already stored is inserted;
        /// otherwise the site's RootUrl is inserted if it is not already stored.
        /// </remarks>
        /// <param name="crawlSite">Site definition to crawl; must not be null.</param>
        /// <param name="crawlersCapacity">Saved to m_CrawlersCapacity; not otherwise used in this method.</param>
        /// <exception cref="System.ArgumentNullException">Thrown when crawlSite is null.</exception>
        public void Initialize(SiteEntity crawlSite, int crawlersCapacity)
        {
            // Fail before any state is touched instead of with a NullReferenceException
            // after the storages have already been initialized.
            if (crawlSite == null)
            {
                throw new System.ArgumentNullException("crawlSite");
            }

            //save the params
            m_SiteEntity = crawlSite;
            m_CrawlersCapacity = crawlersCapacity;

            // NOTE(review): the connection string, including credentials, is hard-coded in source.
            // It should be supplied from configuration; kept here so behavior is unchanged.
            const string connectionString = "Server=localhost;Database=regexspider;Uid=root;Pwd=tizmon;";

            //init the storage (MySql)
            // (Commented-out code previously here initialized RegExSpider.Storage.XmlProvider
            // storages via InitializeStorage(null) instead.)
            m_LinksStorage = new RegExSpider.Storage.MySQLProvider.LinksStorage();
            m_ElementStorage = new RegExSpider.Storage.MySQLProvider.ElementStorage();

            // Each storage receives its own argument array.
            m_LinksStorage.InitializeStorage(new string[] { connectionString });
            m_ElementStorage.InitializeStorage(new string[] { connectionString });

            //load base crawl urls
            if (m_SiteEntity.StartPointUrls.Count > 0)
            {
                foreach (var item in m_SiteEntity.StartPointUrls)
                {
                    // The counter advances for every start point, including ones already stored.
                    m_LinksCounter++;

                    if (m_LinksStorage.IsExists(item) == false)
                    {
                        m_LinksStorage.InsertLink(new LinkEntity(m_LinksCounter, 0, item));
                    }
                }
            }
            else
            {
                // NOTE(review): unlike the start-point branch, m_LinksCounter is not incremented
                // here, so the root link is created with the counter's current value — confirm intended.
                if (m_LinksStorage.IsExists(m_SiteEntity.RootUrl) == false)
                {
                    m_LinksStorage.InsertLink(new LinkEntity(m_LinksCounter, 0, m_SiteEntity.RootUrl));
                }
            }
        }
        /// <summary>
        /// Creates a page handler for the given site, starting with an empty result list.
        /// </summary>
        /// <param name="ownerSite">Site entity kept in m_SiteEntity for this handler.</param>
        public PageHandler(SiteEntity ownerSite)
        {
            // Remember which site this handler works for.
            m_SiteEntity = ownerSite;

            // Start with no collected elements.
            m_Results = new List<ElementEntity>();
        }