// Appends the percent-encoded UTF-8 byte sequence for the code point
// "cp" to "buffer". Each byte is written via percentEncode as '%'
// followed by two entries of the "hex" table.
//
// Values up to 0x7F are emitted as a single byte, up to 0x7FF as two
// bytes, up to 0xFFFF as three bytes, and anything larger as four bytes.
// No validation of "cp" is performed here.
private static void percentEncodeUtf8(IntList buffer, int cp) {
    if (cp <= 0x7F) {
        // Single-byte form: the value itself is the byte.
        percentEncode(buffer, cp);
        return;
    }

    int leadByte;
    int continuationCount;
    if (cp <= 0x7FF) {
        leadByte = 0xC0 | ((cp >> 6) & 0x1F);
        continuationCount = 1;
    } else if (cp <= 0xFFFF) {
        leadByte = 0xE0 | ((cp >> 12) & 0x0F);
        continuationCount = 2;
    } else {
        leadByte = 0xF0 | ((cp >> 18) & 0x07);
        continuationCount = 3;
    }

    percentEncode(buffer, leadByte);
    // Continuation bytes carry six payload bits each, most significant first.
    for (int shift = (continuationCount - 1) * 6; shift >= 0; shift -= 6) {
        percentEncode(buffer, 0x80 | ((cp >> shift) & 0x3F));
    }
}
// Parses the string "s" into a URL, optionally resolving it against
// "baseurl", using a character-driven state machine.
//
// Parameters:
//   s        - the text to parse; must not be null.
//   baseurl  - base URL used for relative references; may be null.
//   encoding - name passed to TextEncoding.getEncoder when non-null.
//              NOTE(review): the resulting encoder is currently never used
//              because the query is always serialized as UTF-8 (see the
//              Query state below) - confirm whether that is intended.
//   strict   - when true, any recorded syntax violation makes the method
//              return null instead of a best-effort result.
//
// Returns the parsed URL, or null when:
//   * the input has no scheme and "baseurl" is null or its scheme is not
//     one of http, https, ftp, gopher, ws, wss, file;
//   * hostParse rejects the host in the Host state;
//   * the port contains a character that is neither a digit, a delimiter,
//     nor tab/LF/CR;
//   * "strict" is true and a syntax violation was recorded.
//
// Throws ArgumentException when "s" is null, when "s" contains an unpaired
// surrogate, or when hostParse rejects the host of a file URL.
// Throws InvalidOperationException if the state machine reaches an
// unknown state.
public static URL parse(string s, URL baseurl, string encoding, bool strict) {
    if (s == null) throw new ArgumentException();
    int beginning = 0;
    int ending = s.Length - 1;
    bool relative = false;
    URL url = new URL();
    ITextEncoder encoder = null;
    ParseState state = ParseState.SchemeStart;
    if (encoding != null) {
        encoder = TextEncoding.getEncoder(encoding);
    }
    if (s.IndexOf("http://", StringComparison.Ordinal) == 0) {
        // Fast path: skip the scheme states for the most common prefix.
        state = ParseState.AuthorityIgnoreSlashes;
        url.scheme = "http";
        beginning = 7;
        relative = true;
    } else {
        // Skip leading tab, LF, FF, CR and space.
        while (beginning < s.Length) {
            char c = s[beginning];
            if (c != 0x09 && c != 0x0a && c != 0x0c && c != 0x0d && c != 0x20) {
                break;
            }
            beginning++;
        }
    }
    // Skip trailing whitespace; afterwards "ending" is the exclusive end index.
    while (ending >= beginning) {
        char c = s[ending];
        if (c != 0x09 && c != 0x0a && c != 0x0c && c != 0x0d && c != 0x20) {
            ending++;
            break;
        }
        ending--;
    }
    if (ending < beginning) {
        ending = beginning;
    }
    bool atflag = false;
    bool bracketflag = false;
    IntList buffer = new IntList();
    IntList query = null;
    IntList fragment = null;
    IntList password = null;
    IntList username = null;
    IntList schemeData = null;
    bool error = false;
    IList<string> path = new List<string>();
    int index = beginning;
    int hostStart = -1;
    int portstate = 0;
    // The loop runs one extra iteration at index == ending with c == -1 so
    // that every state gets a chance to handle end of input.
    while (index <= ending) {
        int oldindex = index;
        int c = -1;
        if (index >= ending) {
            c = -1;
            index++;
        } else {
            c = s[index];
            if (c >= 0xD800 && c <= 0xDBFF && index + 1 < ending &&
                    s[index + 1] >= 0xDC00 && s[index + 1] <= 0xDFFF) {
                // Get the Unicode code point for the surrogate pair
                c = 0x10000 + (c - 0xD800) * 0x400 + (s[index + 1] - 0xDC00);
                index++;
            } else if (c >= 0xD800 && c <= 0xDFFF) {
                // illegal surrogate
                throw new ArgumentException();
            }
            index++;
        }
        // From here on "index" points just past the current character, so
        // s[index] and s[index + 1] are the two characters that follow it.
        switch (state) {
            case ParseState.SchemeStart:
                if (c >= 'A' && c <= 'Z') {
                    buffer.appendInt(c + 0x20);
                    state = ParseState.Scheme;
                } else if (c >= 'a' && c <= 'z') {
                    buffer.appendInt(c);
                    state = ParseState.Scheme;
                } else {
                    index = oldindex;
                    state = ParseState.NoScheme;
                }
                break;
            case ParseState.Scheme:
                if (c >= 'A' && c <= 'Z') {
                    buffer.appendInt(c + 0x20);
                } else if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
                        c == '.' || c == '-' || c == '+') {
                    // Digits are valid after the first scheme character
                    // (e.g. "h323:", "s3:").
                    buffer.appendInt(c);
                } else if (c == ':') {
                    url.scheme = buffer.ToString();
                    buffer.clearAll();
                    if (url.scheme.Equals("http") ||
                            url.scheme.Equals("https") ||
                            url.scheme.Equals("ftp") ||
                            url.scheme.Equals("gopher") ||
                            url.scheme.Equals("ws") ||
                            url.scheme.Equals("wss") ||
                            url.scheme.Equals("file")) {
                        relative = true;
                    }
                    if (url.scheme.Equals("file")) {
                        state = ParseState.Relative;
                        relative = true;
                    } else if (relative && baseurl != null && url.scheme.Equals(baseurl.scheme)) {
                        state = ParseState.RelativeOrAuthority;
                    } else if (relative) {
                        state = ParseState.AuthorityFirstSlash;
                    } else {
                        schemeData = new IntList();
                        state = ParseState.SchemeData;
                    }
                } else {
                    // Not a scheme after all: restart from the beginning
                    // and treat the input as a relative reference.
                    buffer.clearAll();
                    index = beginning;
                    state = ParseState.NoScheme;
                }
                break;
            case ParseState.SchemeData:
                if (c == '?') {
                    query = new IntList();
                    state = ParseState.Query;
                    break;
                } else if (c == '#') {
                    fragment = new IntList();
                    state = ParseState.Fragment;
                    break;
                }
                if ((c >= 0 && (!isUrlCodePoint(c) && c != '%')) ||
                        (c == '%' && (index + 2 > ending ||
                            !isHexDigit(s[index]) ||
                            !isHexDigit(s[index + 1])))) {
                    error = true;
                }
                if (c >= 0 && c != 0x09 && c != 0x0a && c != 0x0d) {
                    if (c < 0x20 || c == 0x7F) {
                        percentEncode(schemeData, c);
                    } else if (c < 0x7F) {
                        schemeData.appendInt(c);
                    } else {
                        percentEncodeUtf8(schemeData, c);
                    }
                }
                break;
            case ParseState.NoScheme:
                if (baseurl == null) return null;
                //Console.WriteLine("no scheme: [%s] [%s]",s,baseurl);
                if (!(baseurl.scheme.Equals("http") ||
                        baseurl.scheme.Equals("https") ||
                        baseurl.scheme.Equals("ftp") ||
                        baseurl.scheme.Equals("gopher") ||
                        baseurl.scheme.Equals("ws") ||
                        baseurl.scheme.Equals("wss") ||
                        baseurl.scheme.Equals("file"))) {
                    return null;
                }
                state = ParseState.Relative;
                index = oldindex;
                break;
            case ParseState.RelativeOrAuthority:
                if (c == '/' && index < ending && s[index] == '/') {
                    index++;
                    state = ParseState.AuthorityIgnoreSlashes;
                } else {
                    error = true;
                    state = ParseState.Relative;
                    index = oldindex;
                }
                break;
            case ParseState.Relative: {
                relative = true;
                if (!"file".Equals(url.scheme)) {
                    // For non-file schemes this state is only entered from
                    // NoScheme or RelativeOrAuthority, both of which have
                    // already established that baseurl is not null.
                    url.scheme = baseurl.scheme;
                }
                // A "file:" URL can reach this state without a base URL, so
                // every use of baseurl below is guarded (as RelativeSlash
                // already does).
                if (c < 0) {
                    if (baseurl != null) {
                        url.host = baseurl.host;
                        url.port = baseurl.port;
                        path = pathList(baseurl.path);
                        url.query = baseurl.query;
                    }
                } else if (c == '/' || c == '\\') {
                    if (c == '\\') {
                        error = true;
                    }
                    state = ParseState.RelativeSlash;
                } else if (c == '?') {
                    if (baseurl != null) {
                        url.host = baseurl.host;
                        url.port = baseurl.port;
                        path = pathList(baseurl.path);
                    }
                    query = new IntList();
                    state = ParseState.Query;
                } else if (c == '#') {
                    if (baseurl != null) {
                        url.host = baseurl.host;
                        url.port = baseurl.port;
                        path = pathList(baseurl.path);
                        url.query = baseurl.query;
                    }
                    fragment = new IntList();
                    state = ParseState.Fragment;
                } else {
                    if (baseurl != null) {
                        url.host = baseurl.host;
                        url.port = baseurl.port;
                        path = pathList(baseurl.path);
                    }
                    if (path.Count > 0) {
                        // Pop path
                        path.RemoveAt(path.Count - 1);
                    }
                    state = ParseState.RelativePath;
                    index = oldindex;
                }
                break;
            }
            case ParseState.RelativeSlash:
                if (c == '/' || c == '\\') {
                    if (c == '\\') {
                        error = true;
                    }
                    if ("file".Equals(url.scheme)) {
                        state = ParseState.FileHost;
                    } else {
                        state = ParseState.AuthorityIgnoreSlashes;
                    }
                } else {
                    if (baseurl != null) {
                        url.host = baseurl.host;
                        url.port = baseurl.port;
                    }
                    state = ParseState.RelativePath;
                    index = oldindex;
                }
                break;
            case ParseState.AuthorityFirstSlash:
                if (c == '/') {
                    state = ParseState.AuthoritySecondSlash;
                } else {
                    error = true;
                    state = ParseState.AuthorityIgnoreSlashes;
                    index = oldindex;
                }
                break;
            case ParseState.AuthoritySecondSlash:
                if (c == '/') {
                    state = ParseState.AuthorityIgnoreSlashes;
                } else {
                    error = true;
                    state = ParseState.AuthorityIgnoreSlashes;
                    index = oldindex;
                }
                break;
            case ParseState.AuthorityIgnoreSlashes:
                if (c != '/' && c != '\\') {
                    username = new IntList();
                    index = oldindex;
                    hostStart = index;
                    state = ParseState.Authority;
                } else {
                    error = true;
                }
                break;
            case ParseState.Authority:
                if (c == '@') {
                    if (atflag) {
                        // A second '@' is kept as a literal "%40" in the
                        // credentials collected so far.
                        IntList previous = (password == null) ? username : password;
                        error = true;
                        previous.appendInt('%');
                        previous.appendInt('4');
                        previous.appendInt('0');
                    }
                    atflag = true;
                    int[] array = buffer.array();
                    for (int i = 0; i < buffer.Count; i++) {
                        int cp = array[i];
                        if (cp == 0x9 || cp == 0xa || cp == 0xd) {
                            error = true;
                            continue;
                        }
                        // Validate the buffered code point itself (cp) and
                        // the two buffer entries after it (indexed by i).
                        if ((!isUrlCodePoint(cp) && cp != '%') ||
                                (cp == '%' && (i + 3 > buffer.Count ||
                                    !isHexDigit(array[i + 1]) ||
                                    !isHexDigit(array[i + 2])))) {
                            error = true;
                        }
                        if (cp == ':' && password == null) {
                            // First ':' switches from username to password.
                            password = new IntList();
                            continue;
                        }
                        IntList result = (password == null) ? username : password;
                        if (cp <= 0x20 || cp >= 0x7F ||
                                ((cp & 0x7F) == cp && "#<>?`\"".IndexOf((char)cp) >= 0)) {
                            percentEncodeUtf8(result, cp);
                        } else {
                            result.appendInt(cp);
                        }
                    }
                    //Console.WriteLine("username=%s",username);
                    //Console.WriteLine("password=%s",password);
                    buffer.clearAll();
                    hostStart = index;
                } else if (c < 0 || ((c & 0x7F) == c && "/\\?#".IndexOf((char)c) >= 0)) {
                    // End of authority: rewind to where the host begins and
                    // re-read it in the Host state.
                    buffer.clearAll();
                    state = ParseState.Host;
                    index = hostStart;
                } else {
                    buffer.appendInt(c);
                }
                break;
            case ParseState.FileHost:
                if (c < 0 || ((c & 0x7F) == c && "/\\?#".IndexOf((char)c) >= 0)) {
                    index = oldindex;
                    if (buffer.Count == 2) {
                        int c1 = buffer[0];
                        int c2 = buffer[1];
                        if ((c2 == '|' || c2 == ':') &&
                                ((c1 >= 'A' && c1 <= 'Z') || (c1 >= 'a' && c1 <= 'z'))) {
                            // Looks like a drive letter rather than a host;
                            // keep the buffer and treat it as a path segment.
                            state = ParseState.RelativePath;
                            break;
                        }
                    }
                    string host = hostParse(buffer.ToString());
                    if (host == null) throw new ArgumentException();
                    url.host = host;
                    buffer.clearAll();
                    state = ParseState.RelativePathStart;
                } else if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    buffer.appendInt(c);
                }
                break;
            case ParseState.Host:
            case ParseState.HostName:
                if (c == ':' && !bracketflag) {
                    string host = hostParse(buffer.ToString());
                    if (host == null) return null;
                    url.host = host;
                    buffer.clearAll();
                    state = ParseState.Port;
                } else if (c < 0 || ((c & 0x7F) == c && "/\\?#".IndexOf((char)c) >= 0)) {
                    string host = hostParse(buffer.ToString());
                    if (host == null) return null;
                    url.host = host;
                    buffer.clearAll();
                    index = oldindex;
                    state = ParseState.RelativePathStart;
                } else if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    // Track brackets so a ':' inside "[...]" is not taken
                    // as the port separator.
                    if (c == '[') {
                        bracketflag = true;
                    } else if (c == ']') {
                        bracketflag = false;
                    }
                    buffer.appendInt(c);
                }
                break;
            case ParseState.Port:
                if (c >= '0' && c <= '9') {
                    if (c != '0') {
                        portstate = 2; // first non-zero found
                    } else if (portstate == 0) {
                        portstate = 1; // have a port number
                    }
                    // Leading zeros are dropped: digits are buffered only
                    // once a non-zero digit has been seen.
                    if (portstate == 2) {
                        buffer.appendInt(c);
                    }
                } else if (c < 0 || ((c & 0x7F) == c && "/\\?#".IndexOf((char)c) >= 0)) {
                    string bufport = "";
                    if (portstate == 1) {
                        bufport = "0";
                    } else if (portstate == 2) {
                        bufport = buffer.ToString();
                    }
                    //Console.WriteLine("port: [%s]",buffer.ToString());
                    // Ports listed here for a given scheme are stored as "".
                    if ((url.scheme.Equals("http") || url.scheme.Equals("ws")) &&
                            bufport.Equals("80")) {
                        bufport = "";
                    }
                    if ((url.scheme.Equals("https") || url.scheme.Equals("wss")) &&
                            bufport.Equals("443")) {
                        bufport = "";
                    }
                    if (url.scheme.Equals("gopher") && bufport.Equals("70")) {
                        bufport = "";
                    }
                    if (url.scheme.Equals("ftp") && bufport.Equals("21")) {
                        bufport = "";
                    }
                    url.port = bufport;
                    buffer.clearAll();
                    state = ParseState.RelativePathStart;
                    index = oldindex;
                } else if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    return null;
                }
                break;
            case ParseState.Query:
                if (c < 0 || c == '#') {
                    // NOTE(review): utf8 is always true here, so the encoder
                    // branch below is currently never taken - confirm
                    // whether the flag was meant to depend on "relative".
                    bool utf8 = true;
                    if (relative) {
                        utf8 = true;
                    }
                    if (utf8 || encoder == null) {
                        // NOTE: Encoder errors can never happen in
                        // this case
                        for (int i = 0; i < buffer.Count; i++) {
                            int ch = buffer[i];
                            if (ch < 0x21 || ch > 0x7e || ch == 0x22 || ch == 0x23 ||
                                    ch == 0x3c || ch == 0x3e || ch == 0x60) {
                                percentEncodeUtf8(query, ch);
                            } else {
                                query.appendInt(ch);
                            }
                        }
                    } else {
                        // Encode the query with the requested encoder and
                        // percent-encode the resulting bytes. An IOException
                        // from the encoder propagates to the caller.
                        MemoryOutputStream baos = new MemoryOutputStream();
                        encoder.encode(baos, buffer.array(), 0, buffer.Count, encodingError);
                        byte[] bytes = baos.toByteArray();
                        foreach (var octet in bytes) {
                            if (octet < 0x21 || octet > 0x7e || octet == 0x22 || octet == 0x23 ||
                                    octet == 0x3c || octet == 0x3e || octet == 0x60) {
                                percentEncode(query, octet);
                            } else {
                                query.appendInt(octet);
                            }
                        }
                        baos.Close();
                    }
                    buffer.clearAll();
                    if (c == '#') {
                        fragment = new IntList();
                        state = ParseState.Fragment;
                    }
                } else if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    if ((!isUrlCodePoint(c) && c != '%') ||
                            (c == '%' && (index + 2 > ending ||
                                !isHexDigit(s[index]) ||
                                !isHexDigit(s[index + 1])))) {
                        error = true;
                    }
                    buffer.appendInt(c);
                }
                break;
            case ParseState.RelativePathStart:
                if (c == '\\') {
                    error = true;
                }
                state = ParseState.RelativePath;
                if (c != '/' && c != '\\') {
                    index = oldindex;
                }
                break;
            case ParseState.RelativePath:
                if ((c < 0 || c == '/' || c == '\\') || (c == '?' || c == '#')) {
                    if (c == '\\') {
                        error = true;
                    }
                    if (buffer.Count == 2 && buffer[0] == '.' && buffer[1] == '.') {
                        // ".." removes the previous segment.
                        if (path.Count > 0) {
                            path.RemoveAt(path.Count - 1);
                        }
                        if (c != '/' && c != '\\') {
                            path.Add("");
                        }
                    } else if (buffer.Count == 1 && buffer[0] == '.') {
                        // "." adds nothing except a trailing empty segment.
                        if (c != '/' && c != '\\') {
                            path.Add("");
                        }
                    } else {
                        if ("file".Equals(url.scheme) && path.Count == 0 && buffer.Count == 2) {
                            int c1 = buffer[0];
                            int c2 = buffer[1];
                            if ((c2 == '|' || c2 == ':') &&
                                    ((c1 >= 'A' && c1 <= 'Z') || (c1 >= 'a' && c1 <= 'z'))) {
                                // Normalize a "C|" drive letter to "C:".
                                buffer[1] = ':';
                            }
                        }
                        path.Add(buffer.ToString());
                    }
                    buffer.clearAll();
                    if (c == '?') {
                        query = new IntList();
                        state = ParseState.Query;
                    }
                    if (c == '#') {
                        fragment = new IntList();
                        state = ParseState.Fragment;
                    }
                } else if (c == '%' && index + 2 <= ending && s[index] == '2' &&
                        (s[index + 1] == 'e' || s[index + 1] == 'E')) {
                    // "%2e" is treated as a literal '.' so that dot segments
                    // written in escaped form are still recognised.
                    index += 2;
                    buffer.appendInt('.');
                } else if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    if ((!isUrlCodePoint(c) && c != '%') ||
                            (c == '%' && (index + 2 > ending ||
                                !isHexDigit(s[index]) ||
                                !isHexDigit(s[index + 1])))) {
                        error = true;
                    }
                    if (c <= 0x20 || c >= 0x7F ||
                            ((c & 0x7F) == c && "#<>?`\"".IndexOf((char)c) >= 0)) {
                        percentEncodeUtf8(buffer, c);
                    } else {
                        buffer.appendInt(c);
                    }
                }
                break;
            case ParseState.Fragment:
                if (c < 0) {
                    break;
                }
                if (c == 0x09 || c == 0x0a || c == 0x0d) {
                    error = true;
                } else {
                    if ((!isUrlCodePoint(c) && c != '%') ||
                            (c == '%' && (index + 2 > ending ||
                                !isHexDigit(s[index]) ||
                                !isHexDigit(s[index + 1])))) {
                        error = true;
                    }
                    if (c < 0x20 || c == 0x7F) {
                        percentEncode(fragment, c);
                    } else if (c < 0x7F) {
                        fragment.appendInt(c);
                    } else {
                        percentEncodeUtf8(fragment, c);
                    }
                }
                break;
            default:
                throw new InvalidOperationException();
        }
    }
    if (error && strict) return null;
    if (schemeData != null) {
        url.schemeData = schemeData.ToString();
    }
    // Serialize the path segments; an empty list becomes "/".
    StringBuilder builder = new StringBuilder();
    if (path.Count == 0) {
        builder.Append('/');
    } else {
        foreach (var segment in path) {
            builder.Append('/');
            builder.Append(segment);
        }
    }
    url.path = builder.ToString();
    if (query != null) {
        url.query = query.ToString();
    }
    if (fragment != null) {
        url.fragment = fragment.ToString();
    }
    if (password != null) {
        url.password = password.ToString();
    }
    if (username != null) {
        url.username = username.ToString();
    }
    return url;
}
// Appends a three-element escape for "b" to "buffer": a '%' sign followed
// by the "hex" table entries for bits 4-7 and bits 0-3 of "b". Only the
// low eight bits of "b" influence the output.
// NOTE(review): "hex" is declared elsewhere; presumably it holds the 16
// hexadecimal digit characters - confirm its casing there.
private static void percentEncode(IntList buffer, int b) {
    int highNibble = (b >> 4) & 0x0F;
    int lowNibble = b & 0x0F;
    buffer.appendInt('%');
    buffer.appendInt(hex[highNibble]);
    buffer.appendInt(hex[lowNibble]);
}