Ejemplo n.º 1
0
        /// <summary>
        /// Parses the file at <c>FullPath</c> as SGML/HTML when the resource's content type
        /// mentions "html", and writes the words of every text/CDATA node to the debug
        /// output, prefixed with the XPath of the node that contains the text.
        /// </summary>
        /// <param name="resource">Resource whose <c>ContentType</c> decides whether the file is parsed.</param>
        /// <returns>The same <paramref name="resource"/> instance that was passed in.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="resource"/> is <c>null</c>.</exception>
        protected Resource Parse(Resource resource)
        {
            if (resource == null)
                throw new ArgumentNullException ("resource");

            // Ordinal, case-insensitive match: ToLower () is culture-sensitive and would
            // throw on a missing content type. A missing content type is treated as "not HTML".
            string contentType = resource.ContentType;
            if (contentType == null || contentType.IndexOf ("html", StringComparison.OrdinalIgnoreCase) < 0)
                return resource;

            using (Stream stream = System.IO.File.OpenRead (FullPath))
            using (StreamReader streamReader = new StreamReader (stream)) {

                SgmlParser parser = new SgmlParser ();
                XmlDocument xmlDom = parser.ParseSgml (streamReader);

                // Iterative depth-first walk; an explicit stack avoids recursion on deep documents.
                Stack<XmlNode> pending = new Stack<XmlNode> ();

                foreach (XmlNode topLevel in xmlDom.ChildNodes)
                    pending.Push (topLevel);

                while (pending.Count > 0) {
                    XmlNode current = pending.Pop ();

                    switch (current.NodeType) {
                    case XmlNodeType.Element:
                        // XmlNode.Value is always null for elements; their text lives in
                        // child Text/CDATA nodes, so only descend here.
                        foreach (XmlNode child in current.ChildNodes)
                            pending.Push (child);
                        break;

                    case XmlNodeType.Text:
                    case XmlNodeType.CDATA:
                        string text = current.Value;
                        if (string.IsNullOrEmpty (text))
                            break;

                        // A null separator splits on every white-space character, so tabs
                        // and line breaks inside the markup do not end up inside "words".
                        string[] words = text.Split ((char[])null, StringSplitOptions.RemoveEmptyEntries);
                        if (words.Length == 0)
                            break;

                        // Report the path of the containing node; fall back to the text node
                        // itself if it has no parent.
                        // NOTE(review): assumes GetXPathToNode accepts an element node, as in
                        // the previous implementation - confirm against the helper.
                        string xpath = GetXPathToNode (current.ParentNode ?? current);

                        System.Diagnostics.Debug.WriteLine (xpath + string.Join ("-", words));
                        break;

                    default:
                        // Declarations, comments, doctype, whitespace, entities, attributes, ...
                        // are skipped together with anything below them.
                        break;
                    }
                }
            }

            // NOTE: an earlier, commented-out implementation used to live here. It matched
            // the URIs in ResourceCache.Resources and the labels in NodesCache.NodesTable
            // against the resource content and recorded the hits in
            // resource.ReferencedResources / ReferencedClasses / ReferencedObjects.
            // None of that is done by the current code.

            return resource;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses the file at <c>FullPath</c> as SGML/HTML when the resource's content type
        /// mentions "html", and writes the words of every text/CDATA node to the debug
        /// output, prefixed with the XPath of the node that contains the text.
        /// </summary>
        /// <param name="resource">Resource whose <c>ContentType</c> decides whether the file is parsed.</param>
        /// <returns>The same <paramref name="resource"/> instance that was passed in.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="resource"/> is <c>null</c>.</exception>
        protected Resource Parse(Resource resource)
        {
            if (resource == null)
            {
                throw new ArgumentNullException("resource");
            }

            // Ordinal, case-insensitive match: ToLower() is culture-sensitive and would
            // throw on a missing content type. A missing content type is treated as "not HTML".
            string contentType = resource.ContentType;
            bool   isHtml      = contentType != null &&
                                 contentType.IndexOf("html", StringComparison.OrdinalIgnoreCase) >= 0;

            if (isHtml)
            {
                using (Stream stream = System.IO.File.OpenRead(FullPath))
                    using (StreamReader streamReader = new StreamReader(stream))
                    {
                        SgmlParser  parser = new SgmlParser();
                        XmlDocument xmlDom = parser.ParseSgml(streamReader);

                        // Iterative depth-first walk; an explicit stack avoids recursion on deep documents.
                        Stack <XmlNode> pending = new Stack <XmlNode> ();

                        foreach (XmlNode topLevel in xmlDom.ChildNodes)
                        {
                            pending.Push(topLevel);
                        }

                        while (pending.Count > 0)
                        {
                            XmlNode current = pending.Pop();

                            switch (current.NodeType)
                            {
                            case XmlNodeType.Element:

                                // XmlNode.Value is always null for elements; their text lives in
                                // child Text/CDATA nodes, so only descend here.
                                foreach (XmlNode child in current.ChildNodes)
                                {
                                    pending.Push(child);
                                }

                                break;

                            case XmlNodeType.Text:
                            case XmlNodeType.CDATA:

                                string text = current.Value;
                                if (string.IsNullOrEmpty(text))
                                {
                                    break;
                                }

                                // A null separator splits on every white-space character, so tabs
                                // and line breaks inside the markup do not end up inside "words".
                                string[] words = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                                if (words.Length == 0)
                                {
                                    break;
                                }

                                // Report the path of the containing node; fall back to the text
                                // node itself if it has no parent.
                                // NOTE(review): assumes GetXPathToNode accepts an element node, as
                                // in the previous implementation - confirm against the helper.
                                string xpath = GetXPathToNode(current.ParentNode ?? current);

                                System.Diagnostics.Debug.WriteLine(xpath + string.Join("-", words));

                                break;

                            default:

                                // Declarations, comments, doctype, whitespace, entities, attributes, ...
                                // are skipped together with anything below them.
                                break;
                            }
                        }
                    }
            }

            // NOTE: an earlier, commented-out implementation used to live here. It matched
            // the URIs in ResourceCache.Resources and the labels in NodesCache.NodesTable
            // against the resource content and recorded the hits in
            // resource.ReferencedResources / ReferencedClasses / ReferencedObjects.
            // None of that is done by the current code.

            return resource;
        }