// Processes a single element and, unless its content was captured as an
// XML literal, each of its child elements.
//
// In order, this method:
//  - resolves xml:base against the context's base URI (non-XHTML mode only);
//  - records xmlns / xmlns:prefix declarations in local copies of the
//    context's namespace and IRI maps;
//  - hands rdf:RDF elements to miniRdfXml and returns immediately;
//  - derives newSubject and currentObject from the about, src, resource,
//    href and typeof attributes (Steps 4 and 5);
//  - adds triples to outputGraph for typeof (Step 6), rel/rev (Step 7) and
//    property (Step 9), or records incomplete triples for the children to
//    complete (Step 8);
//  - completes the incomplete triples inherited from the parent's
//    context (Step 10);
//  - processes each child element under its own evaluation context
//    (Step 13), restoring this element's context after every child.
//
// The "Step N" labels presumably refer to the numbered RDFa processing
// rules -- TODO confirm against the specification this parser targets.
//
// node: the element to process.
// root: when true (and not in XHTML mode), an element that yields no
//   subject of its own falls back to the subject obtained from the empty
//   reference. Recursive calls always pass false.
private void process(IElement node, bool root) {
  IList<RDFa.IncompleteTriple> incompleteTriplesLocal=new List<RDFa.IncompleteTriple>();
  string localLanguage=context.language;
  RDFTerm newSubject=null;
  // Cleared when the element's content is emitted as an XML literal
  bool recurse=true;
  // Set when the element contributes no subject and no property, so its
  // children inherit a copy of the current context instead of a new one
  bool skipElement=false;
  RDFTerm currentObject=null;
  // Work on copies so that declarations on this element only affect
  // this element and its descendants
  IDictionary<string,string> namespacesLocal=
      new PeterO.Support.LenientDictionary<string,string>(context.namespaces);
  IDictionary<string,string> iriMapLocal=
      new PeterO.Support.LenientDictionary<string,string>(context.iriMap);
  string attr=null;
  if(!xhtml){
    // xml:base is honored only outside XHTML mode; note that this
    // updates the current context object in place
    attr=node.getAttribute("xml:base");
    if(attr!=null){
      context.baseURI=URIUtility.relativeResolve(attr, context.baseURI);
    }
  }
  // Support XML namespaces
  foreach(var attrib in node.getAttributes()){
    string name=StringUtility.toLowerCaseAscii(attrib.getName());
    if(name.Equals("xmlns")){
      // Default namespace: stored under the empty prefix in both maps
      iriMapLocal.Add("", attrib.getValue());
      namespacesLocal.Add("", attrib.getValue());
    } else if(name.StartsWith("xmlns:",StringComparison.Ordinal) && name.Length>6){
      string prefix=name.Substring(6);
      // The "_" prefix is kept out of the IRI map but is
      // still recorded as an XML namespace
      if(!"_".Equals(prefix)){
        iriMapLocal.Add(prefix, attrib.getValue());
      }
      namespacesLocal.Add(prefix, attrib.getValue());
    }
  }
  attr=node.getAttribute("xml:lang");
  if(attr!=null){
    localLanguage=attr;
  }
  // Support RDF/XML metadata
  if(node.getLocalName().Equals("RDF") &&
      RDF_NAMESPACE.Equals(node.getNamespaceURI())){
    // NOTE(review): this calls a two-argument miniRdfXml overload that
    // is not visible in this part of the file
    miniRdfXml(node,context);
    return;
  }
  string rel=node.getAttribute("rel");
  string rev=node.getAttribute("rev");
  string property=node.getAttribute("property");
  string content=node.getAttribute("content");
  string datatype=node.getAttribute("datatype");
  if(rel==null && rev==null){
    // Step 4
    // No rel/rev: the subject is taken from the first of about,
    // resource, href, src that yields a term
    RDFTerm resource=getSafeCurieOrCurieOrIri(
        node.getAttribute("about"),iriMapLocal);
    if(resource==null){
      resource=getSafeCurieOrCurieOrIri(
          node.getAttribute("resource"),iriMapLocal);
    }
    if(resource==null){
      resource=relativeResolve(node.getAttribute("href"));
    }
    if(resource==null){
      resource=relativeResolve(node.getAttribute("src"));
    }
    if((resource==null || resource.getKind()!=RDFTerm.IRI)){
      string rdfTypeof=getCurie(node.getAttribute("typeof"),iriMapLocal);
      // head and body elements always take the subject obtained
      // from the empty reference
      if(isHtmlElement(node, "head") ||
          isHtmlElement(node, "body")){
        resource=getSafeCurieOrCurieOrIri("",iriMapLocal);
      }
      if(resource==null && !xhtml && root){
        resource=getSafeCurieOrCurieOrIri("",iriMapLocal);
      }
      // A resolvable typeof with no other subject creates a blank node
      if(resource==null && rdfTypeof!=null){
        resource=generateBlankNode();
      }
      if(resource==null){
        // Fall back to the parent's object; without a property
        // attribute the element is marked to be skipped
        if(context.parentObject!=null) {
          resource=context.parentObject;
        }
        if(node.getAttribute("property")==null){
          skipElement=true;
        }
      }
      newSubject=resource;
    } else {
      newSubject=resource;
    }
  } else {
    // Step 5
    // With rel/rev: the subject comes from about or src only;
    // resource and href supply the object instead
    RDFTerm resource=getSafeCurieOrCurieOrIri(
        node.getAttribute("about"),iriMapLocal);
    if(resource==null){
      resource=relativeResolve(node.getAttribute("src"));
    }
    if((resource==null || resource.getKind()!=RDFTerm.IRI)){
      string rdfTypeof=getCurie(node.getAttribute("typeof"),iriMapLocal);
      if(isHtmlElement(node, "head") ||
          isHtmlElement(node, "body")){
        resource=getSafeCurieOrCurieOrIri("",iriMapLocal);
      }
      if(resource==null && !xhtml && root){
        resource=getSafeCurieOrCurieOrIri("",iriMapLocal);
      }
      if(resource==null && rdfTypeof!=null){
        resource=generateBlankNode();
      }
      if(resource==null){
        if(context.parentObject!=null) {
          resource=context.parentObject;
        }
      }
      newSubject=resource;
    } else {
      newSubject=resource;
    }
    resource=getSafeCurieOrCurieOrIri(
        node.getAttribute("resource"),iriMapLocal);
    if(resource==null){
      resource=relativeResolve(node.getAttribute("href"));
    }
    currentObject=resource;
  }
  // Step 6
  // One type triple (predicate RDFTerm.A) per resolvable typeof value
  if(newSubject!=null){
    string[] types=StringUtility.splitAtNonFFSpaces(node.getAttribute("typeof"));
    foreach(var type in types){
      string iri=getCurie(type,iriMapLocal);
      if(iri!=null){
        outputGraph.Add(new RDFTriple(
            newSubject,RDFTerm.A,
            RDFTerm.fromIRI(iri)
            ));
      }
    }
  }
  // Step 7
  // An explicit object exists: emit complete triples, forward for
  // each rel value and reversed for each rev value
  if(currentObject!=null){
    string[] types=StringUtility.splitAtNonFFSpaces(rel);
    foreach(var type in types){
      string iri=getRelTermOrCurie(type, iriMapLocal);
#if DEBUG
      if(!(newSubject!=null))throw new InvalidOperationException("doesn't satisfy newSubject!=null");
#endif
      if(iri!=null){
        outputGraph.Add(new RDFTriple(
            newSubject,
            RDFTerm.fromIRI(iri),currentObject
            ));
      }
    }
    types=StringUtility.splitAtNonFFSpaces(rev);
    foreach(var type in types){
      string iri=getRelTermOrCurie(type, iriMapLocal);
      if(iri!=null){
        outputGraph.Add(new RDFTriple(
            currentObject,
            RDFTerm.fromIRI(iri),
            newSubject
            ));
      }
    }
  } else {
    // Step 8
    // No explicit object: remember the predicates as incomplete
    // triples. The first resolvable predicate also creates a blank
    // node that becomes the current object.
    string[] types=StringUtility.splitAtNonFFSpaces(rel);
    bool hasPredicates=false;
    // Defines predicates
    foreach(var type in types){
      string iri=getRelTermOrCurie(type, iriMapLocal);
      if(iri!=null){
        if(!hasPredicates){
          hasPredicates=true;
          currentObject=generateBlankNode();
        }
        RDFa.IncompleteTriple inc=new RDFa.IncompleteTriple();
        inc.predicate=RDFTerm.fromIRI(iri);
        inc.direction=RDFa.ChainingDirection.Forward;
        incompleteTriplesLocal.Add(inc);
      }
    }
    types=StringUtility.splitAtNonFFSpaces(rev);
    foreach(var type in types){
      string iri=getRelTermOrCurie(type, iriMapLocal);
      if(iri!=null){
        if(!hasPredicates){
          hasPredicates=true;
          currentObject=generateBlankNode();
        }
        RDFa.IncompleteTriple inc=new RDFa.IncompleteTriple();
        inc.predicate=RDFTerm.fromIRI(iri);
        inc.direction=RDFa.ChainingDirection.Reverse;
        incompleteTriplesLocal.Add(inc);
      }
    }
  }
  // Step 9
  string[] preds=StringUtility.splitAtNonFFSpaces(property);
  string datatypeValue=getCurie(datatype, iriMapLocal);
  // A datatype attribute that does not resolve is represented by the
  // empty string, which selects the plain-literal branch below
  if(datatype!=null && datatypeValue==null) {
    datatypeValue="";
  }
  RDFTerm currentProperty=null;
  foreach(var pred in preds){
    string iri=getCurie(pred, iriMapLocal);
    if(iri!=null){
      currentProperty=null;
      if(datatypeValue!=null && datatypeValue.Length>0 &&
          !datatypeValue.Equals(RDF_XMLLITERAL)){
        // Typed literal: content attribute, or else the text content
        string literal=content;
        if(literal==null) {
          literal=getTextNodeText(node);
        }
        currentProperty=RDFTerm.fromTypedString(literal,datatypeValue);
      } else if(node.getAttribute("content")!=null ||
          !hasNonTextChildNodes(node) ||
          (datatypeValue!=null && datatypeValue.Length==0)){
        // Plain literal, tagged with the language when one is in effect
        string literal=node.getAttribute("content");
        if(literal==null) {
          literal=getTextNodeText(node);
        }
        currentProperty=(!string.IsNullOrEmpty(localLanguage)) ?
          RDFTerm.fromLangString(literal, localLanguage) :
            RDFTerm.fromTypedString(literal);
      } else if(hasNonTextChildNodes(node) &&
          (datatypeValue==null || datatypeValue.Equals(RDF_XMLLITERAL))){
        // XML literal
        // The children become part of the literal, so they are not
        // processed separately
        recurse=false;
        // NOTE(review): this assignment persists for the remaining
        // predicates in this loop
        if(datatypeValue==null) {
          datatypeValue=RDF_XMLLITERAL;
        }
        try {
          string literal=ExclusiveCanonicalXML.canonicalize(node,
              false, namespacesLocal);
          currentProperty=RDFTerm.fromTypedString(literal,datatypeValue);
        } catch(ArgumentException){
          // failure to canonicalize
          // NOTE(review): currentProperty stays null here, yet the
          // triple below is still added -- confirm that RDFTriple
          // accepts a null object
        }
      }
#if DEBUG
      if(!(newSubject!=null))throw new InvalidOperationException("doesn't satisfy newSubject!=null");
#endif
      outputGraph.Add(new RDFTriple(
          newSubject,
          RDFTerm.fromIRI(iri),currentProperty
          ));
    }
  }
  // Step 10
  // Complete the triples the parent left pending, using this
  // element's subject as the missing end
  if(!skipElement && newSubject!=null){
    foreach(var triple in context.incompleteTriples){
      if(triple.direction==RDFa.ChainingDirection.Forward){
        outputGraph.Add(new RDFTriple(
            context.parentSubject,
            triple.predicate,
            newSubject));
      } else {
        outputGraph.Add(new RDFTriple(
            newSubject,triple.predicate,
            context.parentSubject));
      }
    }
  }
  // Step 13
  if(recurse){
    foreach(var childNode in node.getChildNodes()){
      IElement childElement;
      RDFa.EvalContext oldContext=context;
      if(childNode is IElement){
        childElement=((IElement)childNode);
        if(skipElement){
          // Skipped element: children see a copy of the current
          // context with only language and mappings updated
          RDFa.EvalContext ec=oldContext.copy();
          ec.language=localLanguage;
          ec.iriMap=iriMapLocal;
          ec.namespaces=namespacesLocal;
          context=ec;
          process(childElement,false);
        } else {
          // Otherwise children get a new context whose parent
          // subject/object come from this element where available
          RDFa.EvalContext ec=new RDFa.EvalContext();
          ec.baseURI=oldContext.baseURI;
          ec.iriMap=iriMapLocal;
          ec.namespaces=namespacesLocal;
          ec.incompleteTriples=incompleteTriplesLocal;
          ec.parentSubject=((newSubject==null) ? oldContext.parentSubject :
            newSubject);
          ec.parentObject=((currentObject==null) ?
              ((newSubject==null) ? oldContext.parentSubject : newSubject) :
                currentObject);
          ec.language=localLanguage;
          context=ec;
          process(childElement,false);
        }
      }
      // Restore this element's context before the next child
      context=oldContext;
    }
  }
}
// Handles a limited subset of RDF/XML metadata; this is not a
// complete RDF/XML implementation.
//
// Every rdf:Description child element of 'node' has its own child
// elements passed to miniRdfXmlChild. The subject is the resolved
// rdf:about attribute when present, otherwise 'subject'; a description
// with neither is ignored. The language handed on is the xml:lang
// attribute of 'node' when present, otherwise the context's language.
// Any other child element in the RDF namespace raises
// NotSupportedException; children in other namespaces are ignored.
private void miniRdfXml(IElement node, RDFa.EvalContext context, RDFTerm subject) {
    string language = context.language;
    foreach (var child in node.getChildNodes()) {
        IElement childElement = child as IElement;
        if (childElement == null) {
            continue;
        }
        // Re-read on every iteration, as the context's language
        // may have been changed while handling an earlier child
        string containerLanguage = node.getAttribute("xml:lang");
        language = (containerLanguage != null) ? containerLanguage : context.language;
        bool isDescription = childElement.getLocalName().Equals("Description");
        bool inRdfNamespace = RDF_NAMESPACE.Equals(childElement.getNamespaceURI());
        if (!inRdfNamespace) {
            continue;
        }
        if (!isDescription) {
            throw new NotSupportedException();
        }
        RDFTerm about = relativeResolve(
            childElement.getAttributeNS(RDF_NAMESPACE, "about"));
        if (about == null) {
            about = subject;
        }
        if (about == null) {
            continue;
        }
        foreach (var grandchild in child.getChildNodes()) {
            IElement propertyElement = grandchild as IElement;
            if (propertyElement != null) {
                miniRdfXmlChild(propertyElement, about, language);
            }
        }
    }
}
// Emits the triple described by one property element found inside an
// rdf:Description element.
//
// The predicate is formed by resolving the element's namespace URI
// concatenated with its local name. If the element has only text
// content, the object is a literal (language-tagged when a language is
// in effect). Otherwise the object is a new blank node and the nested
// content is processed by miniRdfXml with that blank node as the
// default subject.
//
// node: the property element.
// subject: the subject of the emitted triple.
// language: the inherited language; an xml:lang attribute on 'node'
//   overrides it.
// Throws NotSupportedException if the element has non-text children and
// rdf:parseType="Literal".
private void miniRdfXmlChild(IElement node, RDFTerm subject, string language) {
    string nsname = node.getNamespaceURI();
    string ownLanguage = node.getAttribute("xml:lang");
    if (ownLanguage != null) {
        language = ownLanguage;
    }
    string localname = node.getLocalName();
    RDFTerm predicate = relativeResolve(nsname + localname);
    if (!hasNonTextChildNodes(node)) {
        string content = getTextNodeText(node);
        RDFTerm literal = !string.IsNullOrEmpty(language)
            ? RDFTerm.fromLangString(content, language)
            : RDFTerm.fromTypedString(content);
        outputGraph.Add(new RDFTriple(subject, predicate, literal));
        return;
    }
    string parseType = node.getAttributeNS(RDF_NAMESPACE, "parseType");
    if ("Literal".Equals(parseType)) {
        throw new NotSupportedException();
    }
    RDFTerm blank = generateBlankNode();
    // The nested content is processed with this element's language in
    // the shared context. Restore the previous value afterwards (even on
    // failure) so the language does not leak into elements that are
    // processed later with the same context.
    string savedLanguage = context.language;
    try {
        context.language = language;
        miniRdfXml(node, context, blank);
    } finally {
        context.language = savedLanguage;
    }
    outputGraph.Add(new RDFTriple(subject, predicate, blank));
}
// Fills in implied "url", "name" and "photo" values from a hyperlink.
//
// Returns false without touching subProperties unless 'root' is an "a"
// element (name compared ASCII case-insensitively) with an href
// attribute. Otherwise sets "url" from the link, and:
//  - if the link's only child element is an img, sets "name" from the
//    image's text value (falling back to the link's when that is null
//    or blank) and "photo" from the image;
//  - otherwise sets "name" from the link's text value when it is not
//    null or blank.
// Values are only set when absent. Returns true in these cases.
private static bool implyForLink(IElement root, JSONObject subProperties) {
    bool isLink =
        StringUtility.toLowerCaseAscii(root.getLocalName()).Equals("a") &&
        root.getAttribute("href") != null;
    if (!isLink) {
        return false;
    }
    // The link's URL
    setValueIfAbsent(subProperties, "url", getUValue(root));
    IList<IElement> childElements = getChildElements(root);
    bool wrapsSingleImage =
        childElements.Count == 1 &&
        StringUtility.toLowerCaseAscii(childElements[0].getLocalName()).Equals("img");
    if (wrapsSingleImage) {
        IElement image = childElements[0];
        // Prefer the image's own text value; use the link's if it is empty
        string name = getPValue(image);
        if (StringUtility.isNullOrSpaces(name)) {
            name = getPValue(root);
        }
        setValueIfAbsent(subProperties, "name", name);
        // The image's source
        setValueIfAbsent(subProperties, "photo", getUValue(image));
        return true;
    }
    // Plain link: use its text content, if any
    string linkText = getPValue(root);
    if (!StringUtility.isNullOrSpaces(linkText)) {
        setValueIfAbsent(subProperties, "name", linkText);
    }
    return true;
}
// Recursively collects microformat data from 'root' and its descendants.
//
// Class names beginning with "h-" mark 'root' as a microformat; names
// beginning with "p-", "u-", "dt-" or "e-" mark it as a property of the
// enclosing microformat (only when 'properties' is not null).
//  - Microformat: builds an object with "type", optional "children" and
//    "properties" (including implied name/url/photo). When 'root' also
//    has property classes, a copy of that object with a "value" is
//    accumulated into 'properties' per property class; otherwise the
//    object is appended to 'children'. Descendants are walked into the
//    new object only.
//  - Property only: accumulates the corresponding value into
//    'properties', then continues into the child elements.
//  - Neither: continues into the child elements.
private static void propertyWalk(IElement root, JSONObject properties, JSONArray children) {
    string[] classNames = getClassNames(root);
    IList<string> rootTypes = new List<string>();
    bool hasProperties = false;
    foreach (var name in classNames) {
        bool isPropertyClass =
            name.StartsWith("p-", StringComparison.Ordinal) ||
            name.StartsWith("u-", StringComparison.Ordinal) ||
            name.StartsWith("dt-", StringComparison.Ordinal) ||
            name.StartsWith("e-", StringComparison.Ordinal);
        if (isPropertyClass) {
            // Property classes only count when there is somewhere to put them
            if (properties != null) {
                hasProperties = true;
            }
        } else if (name.StartsWith("h-", StringComparison.Ordinal)) {
            rootTypes.Add(name);
        }
    }

    if (rootTypes.Count > 0) {
        // This element is a microformat
        JSONObject microformat = new JSONObject();
        microformat.put("type", new JSONArray(rootTypes));
        // Properties found on descendant elements
        JSONObject subProperties = new JSONObject();
        // Descendant microformats that carry no property class
        JSONArray subChildren = new JSONArray();
        foreach (var child in root.getChildNodes()) {
            IElement childElement = child as IElement;
            if (childElement != null) {
                propertyWalk(childElement, subProperties, subChildren);
            }
        }
        if (subChildren.Length > 0) {
            microformat.put("children", subChildren);
        }

        // Imply missing name and url
        if (!implyForLink(root, subProperties)) {
            if (hasSingleChildElementNamed(root, "a")) {
                implyForLink(getFirstChildElement(root), subProperties);
            } else {
                string impliedName = getPValue(root);
                if (!StringUtility.isNullOrSpaces(impliedName)) {
                    setValueIfAbsent(subProperties, "name", impliedName);
                }
            }
        }
        // Imply missing photo, first from the element itself...
        bool isImageWithSource =
            StringUtility.toLowerCaseAscii(root.getLocalName()).Equals("img") &&
            root.getAttribute("src") != null;
        if (isImageWithSource) {
            setValueIfAbsent(subProperties, "photo", getUValue(root));
        }
        // ...then from a sole descendant image
        if (!subProperties.has("photo")) {
            IList<IElement> images = root.getElementsByTagName("img");
            if (images.Count == 1) {
                setValueIfAbsent(subProperties, "photo", getUValue(images[0]));
            }
        }
        microformat.put("properties", subProperties);

        if (!hasProperties) {
            children.put(microformat);
            return;
        }
        // The microformat is also a property of its parent: store one
        // copy, with the matching value, under each property name
        foreach (var name in classNames) {
            if (name.StartsWith("p-", StringComparison.Ordinal)) {
                JSONObject clone = copyJson(microformat);
                clone.put("value", getPValue(root));
                accumulateValue(properties, name.Substring(2), clone);
            } else if (name.StartsWith("u-", StringComparison.Ordinal)) {
                JSONObject clone = copyJson(microformat);
                clone.put("value", getUValue(root));
                accumulateValue(properties, name.Substring(2), clone);
            } else if (name.StartsWith("dt-", StringComparison.Ordinal)) {
                JSONObject clone = copyJson(microformat);
                clone.put("value", getDTValue(root, getLastKnownTime(properties)));
                accumulateValue(properties, name.Substring(3), clone);
            } else if (name.StartsWith("e-", StringComparison.Ordinal)) {
                JSONObject clone = copyJson(microformat);
                clone.put("value", getEValue(root));
                accumulateValue(properties, name.Substring(2), clone);
            }
        }
        return;
    }

    if (hasProperties) {
        // Plain property element (not itself a microformat)
        foreach (var name in classNames) {
            if (name.StartsWith("p-", StringComparison.Ordinal)) {
                string text = getPValue(root);
                if (!StringUtility.isNullOrSpaces(text)) {
                    accumulateValue(properties, name.Substring(2), text);
                }
            } else if (name.StartsWith("u-", StringComparison.Ordinal)) {
                accumulateValue(properties, name.Substring(2), getUValue(root));
            } else if (name.StartsWith("dt-", StringComparison.Ordinal)) {
                accumulateValue(properties, name.Substring(3),
                    getDTValue(root, getLastKnownTime(properties)));
            } else if (name.StartsWith("e-", StringComparison.Ordinal)) {
                accumulateValue(properties, name.Substring(2), getEValue(root));
            }
        }
    }

    // Keep looking for properties and microformats further down
    foreach (var child in root.getChildNodes()) {
        IElement childElement = child as IElement;
        if (childElement != null) {
            propertyWalk(childElement, properties, children);
        }
    }
}
// Returns the text that a "value" element contributes.
//
//  - class "value-title": the title attribute (via valueOrEmpty);
//  - img / area: the alt attribute, or "" if it is missing;
//  - data: the value attribute, or the trimmed text content if missing;
//  - abbr: the title attribute, or the trimmed text content if missing;
//  - anything else: the trimmed text content.
private static string getValueElementContent(IElement valueElement) {
    if (hasClassName(valueElement, "value-title")) {
        // The value-title class means the title holds the value
        return valueOrEmpty(valueElement.getAttribute("title"));
    }
    string tagName = elementName(valueElement);
    if (tagName.Equals("img") || tagName.Equals("area")) {
        return valueElement.getAttribute("alt") ?? "";
    }
    if (tagName.Equals("data")) {
        return valueElement.getAttribute("value") ?? getTrimmedTextContent(valueElement);
    }
    if (tagName.Equals("abbr")) {
        return valueElement.getAttribute("title") ?? getTrimmedTextContent(valueElement);
    }
    return getTrimmedTextContent(valueElement);
}
// Returns true if the element's class attribute, split at spaces,
// contains 'className' exactly (case-sensitive); false otherwise,
// including when the element has no class attribute.
private static bool hasClassName(IElement e, string className) {
    string classAttribute = e.getAttribute("class");
    // An attribute shorter than the name sought cannot contain it
    if (classAttribute == null || classAttribute.Length < className.Length) {
        return false;
    }
    string[] classTokens = StringUtility.splitAtSpaces(classAttribute);
    return Array.IndexOf(classTokens, className) >= 0;
}
// Returns the values of the element's rel attribute, converted to
// lower case (ASCII) and split at spaces. When there are two or more
// values, duplicates are removed. An empty split result is returned
// as is.
private static string[] getRelNames(IElement element) {
    string relAttribute = StringUtility.toLowerCaseAscii(element.getAttribute("rel"));
    string[] relTokens = StringUtility.splitAtSpaces(relAttribute);
    if (relTokens.Length == 0) {
        return relTokens;
    }
    if (relTokens.Length < 2) {
        // A single value cannot contain duplicates; return a fresh array
        return new List<string>(relTokens).ToArray();
    }
    ISet<string> distinctTokens = new HashSet<string>(relTokens);
    return PeterO.Support.Collections.ToArray(distinctTokens);
}
// Returns the plain-text value of an element: its title attribute if
// present; otherwise, for an img element, its alt attribute when that
// is not null or blank; otherwise the result of
// getValueContent(root, false).
private static string getPValue(IElement root) {
    string title = root.getAttribute("title");
    if (title != null) {
        return title;
    }
    bool isImage = StringUtility.toLowerCaseAscii(root.getLocalName()).Equals("img");
    if (isImage) {
        string alt = root.getAttribute("alt");
        if (!StringUtility.isNullOrSpaces(alt)) {
            return alt;
        }
    }
    return getValueContent(root, false);
}
// Returns the resolved URL that an element refers to.
//
// The URL is read from href (a, link, area), data (object) or src
// (img, source, track, iframe, audio, video, embed) and resolved with
// HtmlDocument.resolveURL. Returns null for any other element, and ""
// when the attribute is missing or empty or resolution yields nothing.
private static string getHref(IElement node) {
    string attributeName;
    switch (StringUtility.toLowerCaseAscii(node.getLocalName())) {
        case "a":
        case "link":
        case "area":
            attributeName = "href";
            break;
        case "object":
            attributeName = "data";
            break;
        case "img":
        case "source":
        case "track":
        case "iframe":
        case "audio":
        case "video":
        case "embed":
            attributeName = "src";
            break;
        default:
            // Not an element that carries a URL
            return null;
    }
    string rawUrl = node.getAttribute(attributeName);
    if (string.IsNullOrEmpty(rawUrl)) {
        return "";
    }
    string resolvedUrl = HtmlDocument.resolveURL(node, rawUrl, null);
    return string.IsNullOrEmpty(resolvedUrl) ? "" : resolvedUrl;
}
// Returns the text that a "value" element contributes to a date/time
// property.
//
//  - class "value-title": the title attribute (via valueOrEmpty);
//  - img / area: the alt attribute, or "" if it is missing;
//  - data: the value attribute, or the trimmed text content if missing;
//  - abbr: the title attribute, or the trimmed text content if missing;
//  - del / ins / time: the datetime attribute; if that is null or blank,
//    the title attribute; if that is missing, the trimmed text content;
//  - anything else: the trimmed text content.
private static string getDTValueContent(IElement valueElement) {
    string tagName = elementName(valueElement);
    if (hasClassName(valueElement, "value-title")) {
        return valueOrEmpty(valueElement.getAttribute("title"));
    }
    if (tagName.Equals("img") || tagName.Equals("area")) {
        return valueElement.getAttribute("alt") ?? "";
    }
    if (tagName.Equals("data")) {
        return valueElement.getAttribute("value") ?? getTrimmedTextContent(valueElement);
    }
    if (tagName.Equals("abbr")) {
        return valueElement.getAttribute("title") ?? getTrimmedTextContent(valueElement);
    }
    if (tagName.Equals("del") || tagName.Equals("ins") || tagName.Equals("time")) {
        string stamp = valueElement.getAttribute("datetime");
        if (StringUtility.isNullOrSpaces(stamp)) {
            // No usable datetime attribute; try the title instead
            stamp = valueElement.getAttribute("title");
        }
        return stamp ?? getTrimmedTextContent(valueElement);
    }
    return getTrimmedTextContent(valueElement);
}
// Returns the class names that apply to an element, with legacy
// microformat names replaced by their modern versions.
//
// The result starts with the names produced by parseLegacyRel for the
// rel attribute, followed by one or more names per class attribute
// value: the entries of complexLegacyMap if the value is a key there,
// else its legacyLabelsMap entry if there is one, else the value
// itself. When the result has two or more names, duplicates are
// removed. If both attributes yield nothing, the empty class array is
// returned as is.
private static string[] getClassNames(IElement element) {
    string[] classTokens = StringUtility.splitAtSpaces(element.getAttribute("class"));
    string[] relTokens = parseLegacyRel(element.getAttribute("rel"));
    if (classTokens.Length == 0 && relTokens.Length == 0) {
        return classTokens;
    }
    List<string> names = new List<string>(relTokens);
    foreach (var token in classTokens) {
        string legacyLabel = legacyLabelsMap[token];
        if (complexLegacyMap.ContainsKey(token)) {
            // One legacy name expands to several modern names
            foreach (var item in complexLegacyMap[token]) {
                names.Add(item);
            }
            continue;
        }
        names.Add(legacyLabel ?? token);
    }
    if (names.Count < 2) {
        return names.ToArray();
    }
    ISet<string> distinctNames = new HashSet<string>(names);
    return PeterO.Support.Collections.ToArray(distinctNames);
}