示例#1
0
        /// <summary>
        /// Processes a single link: downloads its HTML, reports the links found on the page
        /// (minus those matching a no-follow expression), extracts the configured elements
        /// and their fields, and finally signals that handling of the link has finished.
        /// </summary>
        /// <param name="state">
        /// The link to process. It is typed as <c>object</c> and cast directly to
        /// <c>LinkEntity</c>, so passing any other type fails at the cast.
        /// </param>
        public void HandlePage(object state)
        {
            LinkEntity link = (LinkEntity)state;

            // NOTE(review): m_HtmlContent and m_Results are instance fields; if this method can
            // run concurrently on the same instance they are shared between calls - confirm.
            m_HtmlContent = Network.GetHTML(link.Url);

            List<string> foundUrls = RegEx.GetWebPageLinks(m_SiteEntity.RootUrl, m_HtmlContent);

            // Drop every URL that produces a non-empty match for any no-follow expression.
            foreach (string noFollowRule in m_SiteEntity.NoFollowExpressions)
            {
                foundUrls.RemoveAll(url => !string.IsNullOrEmpty(RegEx.GetRegexMatch(url, noFollowRule)));
            }

            OnFoundLinks(foundUrls, link.Depth);

            // Extract the root elements, then the fields inside each root element.
            foreach (var extractionElement in m_SiteEntity.ExtractionElements)
            {
                List<string> matches = GetElementsFromString(m_HtmlContent, extractionElement);

                foreach (var item in matches)
                {
                    ElementEntity newElement = new ElementEntity
                    {
                        // Use the value returned by the atomic increment. Re-reading the field
                        // afterwards could observe another thread's increment and hand out a
                        // duplicate (or skipped) id.
                        Id = Interlocked.Increment(ref m_ElementCounter),
                        Name = extractionElement.Name,
                        Value = item
                    };

                    // Field ids are numbered from 1 within their parent element and only
                    // advance for fields that actually matched.
                    int fieldsCounter = 0;

                    foreach (var field in extractionElement.Fields)
                    {
                        string fieldMatch = GetElementFromString(newElement.Value, field.Value);

                        if (!string.IsNullOrEmpty(fieldMatch))
                        {
                            fieldsCounter++;

                            ElementEntity newField = new ElementEntity
                            {
                                Id = fieldsCounter,
                                Name = field.Key,
                                Value = fieldMatch
                            };

                            newElement.Fields.Add(newField.Name, newField);
                        }
                    }

                    m_Results.Add(newElement);
                }
            }

            // NOTE(review): m_Results is not cleared in this method, so the check and the
            // callback see everything accumulated so far - confirm that is intended.
            if (m_Results.Count > 0)
            {
                OnFoundElements(m_Results);
            }

            OnHandlingFinished(link);
        }
        /// <summary>
        /// Writes one element to the XML writer as an <c>Element</c> node containing its
        /// <c>Name</c>, its <c>Value</c> and a <c>Fields</c> node with one <c>Field</c>
        /// (<c>Name</c>/<c>Value</c>) per entry in <c>element.Fields</c>, then increments the
        /// stored-elements counter.
        /// </summary>
        /// <param name="element">The element to write.</param>
        public void InsertElement(ElementEntity element)
        {
            // The whole element is written under one lock so output from concurrent callers
            // cannot interleave inside a single <Element> node.
            lock (m_SyncXmlWriter)
            {
                m_XmlTextWriter.WriteStartElement("Element");

                m_XmlTextWriter.WriteStartElement("Name");
                m_XmlTextWriter.WriteString(element.Name);
                m_XmlTextWriter.WriteEndElement();

                m_XmlTextWriter.WriteStartElement("Value");
                m_XmlTextWriter.WriteString(element.Value);
                m_XmlTextWriter.WriteEndElement();

                m_XmlTextWriter.WriteStartElement("Fields");

                foreach (var field in element.Fields)
                {
                    m_XmlTextWriter.WriteStartElement("Field");

                    m_XmlTextWriter.WriteStartElement("Name");
                    m_XmlTextWriter.WriteString(field.Value.Name);
                    m_XmlTextWriter.WriteEndElement();

                    m_XmlTextWriter.WriteStartElement("Value");
                    m_XmlTextWriter.WriteString(field.Value.Value);
                    m_XmlTextWriter.WriteEndElement();

                    m_XmlTextWriter.WriteEndElement(); // </Field>
                }

                m_XmlTextWriter.WriteEndElement(); // </Fields>

                m_XmlTextWriter.WriteEndElement(); // </Element>

                // Incremented inside the lock: "++" is a non-atomic read-modify-write, so doing
                // it outside the lock could lose counts when callers run concurrently.
                m_Stored++;
            }
        }