/// <summary>
/// Loads a site definition from an XML file and builds a <see cref="SiteEntity"/> from it.
/// </summary>
/// <remarks>
/// Elements read from the document root:
/// <list type="bullet">
/// <item><description><c>Name</c> and <c>RootUrl</c> (inner text).</description></item>
/// <item><description><c>CrawlUrlsConfig</c>: <c>MaxDepth</c> (integer), every <c>StartPoint</c> child,
/// and every <c>RegEx</c> child of the nested <c>NoFollow</c> element.</description></item>
/// <item><description><c>PageExtractionElements</c>: every <c>Element</c> child (attributes <c>Name</c> and
/// <c>RegEx</c>) together with its <c>Field</c> children (same two attributes).</description></item>
/// </list>
/// Child nodes that are not elements (for example XML comments) are skipped. A missing <c>NoFollow</c> or
/// <c>PageExtractionElements</c> section is treated as empty. The <c>Name</c>, <c>RootUrl</c>,
/// <c>CrawlUrlsConfig</c> and <c>MaxDepth</c> elements and the <c>Name</c>/<c>RegEx</c> attributes are not
/// null-checked here, so a document lacking one of them fails with a null-reference error.
/// </remarks>
/// <param name="xmlPath">Path (or URL) passed to <see cref="XmlDocument.Load(string)"/>.</param>
/// <returns>The populated site entity.</returns>
public static SiteEntity ReadSiteEntityXml(string xmlPath)
{
    SiteEntity siteEntity = new SiteEntity();

    XmlDocument doc = new XmlDocument();
    doc.Load(xmlPath);
    XmlElement root = doc.DocumentElement;

    siteEntity.Name = root["Name"].InnerText;
    siteEntity.RootUrl = root["RootUrl"].InnerText;

    XmlElement crawlConfig = root["CrawlUrlsConfig"];

    // Configuration files are machine-readable: parse with the invariant culture so the
    // result does not depend on the regional settings of the machine running the crawler.
    siteEntity.MaxDepth = int.Parse(
        crawlConfig["MaxDepth"].InnerText,
        System.Globalization.CultureInfo.InvariantCulture);

    // ChildNodes can contain comments and text nodes; iterating it as XmlElement would
    // throw InvalidCastException on those, so iterate as XmlNode and filter.
    foreach (XmlNode node in crawlConfig.ChildNodes)
    {
        XmlElement startPoint = node as XmlElement;
        if (startPoint != null && startPoint.Name == "StartPoint")
        {
            siteEntity.StartPointUrls.Add(startPoint.InnerText);
        }
    }

    // A missing NoFollow section simply contributes no expressions.
    XmlElement noFollow = crawlConfig["NoFollow"];
    if (noFollow != null)
    {
        foreach (XmlNode node in noFollow.ChildNodes)
        {
            XmlElement regex = node as XmlElement;
            if (regex != null && regex.Name == "RegEx")
            {
                siteEntity.NoFollowExpressions.Add(regex.InnerText);
            }
        }
    }

    // A missing PageExtractionElements section simply contributes no extraction elements.
    XmlElement extractionRoot = root["PageExtractionElements"];
    if (extractionRoot != null)
    {
        foreach (XmlNode node in extractionRoot.ChildNodes)
        {
            XmlElement element = node as XmlElement;
            if (element == null || element.Name != "Element")
            {
                continue;
            }

            ExtractionElement extractionElement = new ExtractionElement();
            extractionElement.Name = element.Attributes["Name"].Value;
            extractionElement.RegEx = element.Attributes["RegEx"].Value;

            foreach (XmlNode fieldNode in element.ChildNodes)
            {
                XmlElement field = fieldNode as XmlElement;
                if (field == null || field.Name != "Field")
                {
                    continue;
                }

                ExtractionElement fieldElement = new ExtractionElement();
                fieldElement.Name = field.Attributes["Name"].Value;
                fieldElement.RegEx = field.Attributes["RegEx"].Value;

                // Fields are keyed by the field's Name attribute.
                extractionElement.Fields.Add(fieldElement.Name, fieldElement);
            }

            siteEntity.ExtractionElements.Add(extractionElement);
        }
    }

    return siteEntity;
}
/// <summary>
/// Stores the crawl parameters, creates and initializes the MySQL-backed link and element
/// storages, and seeds the link storage with the site's initial URLs.
/// </summary>
/// <param name="crawlSite">Site definition; its <c>StartPointUrls</c> (or <c>RootUrl</c> when that
/// list is empty) are inserted into the link storage if not already present.</param>
/// <param name="crawlersCapacity">Value saved to <c>m_CrawlersCapacity</c>.</param>
public void Initialize(SiteEntity crawlSite, int crawlersCapacity)
{
    // NOTE(review): the connection string, including database credentials, is hard-coded in
    // source and shared by both storages — consider moving it to configuration.
    const string connectionString = "Server=localhost;Database=regexspider;Uid=root;Pwd=tizmon;";

    // Remember the parameters.
    m_SiteEntity = crawlSite;
    m_CrawlersCapacity = crawlersCapacity;

    // Storage back end: MySQL. (An XML provider under RegExSpider.Storage.XmlProvider was
    // previously wired in here, initialized with a null argument.)
    m_LinksStorage = new RegExSpider.Storage.MySQLProvider.LinksStorage();
    m_ElementStorage = new RegExSpider.Storage.MySQLProvider.ElementStorage();
    m_LinksStorage.InitializeStorage(new[] { connectionString });
    m_ElementStorage.InitializeStorage(new[] { connectionString });

    // No explicit start points: fall back to the site's root URL.
    if (m_SiteEntity.StartPointUrls.Count <= 0)
    {
        // The link counter is not advanced on this path; the root link gets its current value.
        if (m_LinksStorage.IsExists(m_SiteEntity.RootUrl) == false)
        {
            m_LinksStorage.InsertLink(new LinkEntity(m_LinksCounter, 0, m_SiteEntity.RootUrl));
        }

        return;
    }

    // Seed every configured start point that is not stored yet.
    foreach (var startUrl in m_SiteEntity.StartPointUrls)
    {
        // The counter advances for every start point, including ones that already exist.
        m_LinksCounter++;

        if (m_LinksStorage.IsExists(startUrl) == false)
        {
            m_LinksStorage.InsertLink(new LinkEntity(m_LinksCounter, 0, startUrl));
        }
    }
}
/// <summary>
/// Creates a page handler bound to the given site, starting with an empty result list.
/// </summary>
/// <param name="ownerSite">Site entity saved to <c>m_SiteEntity</c>.</param>
public PageHandler(SiteEntity ownerSite)
{
    m_SiteEntity = ownerSite;
    m_Results = new List<ElementEntity>();
}