/// <summary>
/// Builds an item for a purely numeric identifier (per the method name, an ICQ number).
/// The candidate is produced by AttachISBN, hyphens are removed from its value,
/// and it is accepted only if what remains is 6 to 10 digits.
/// </summary>
/// <param name="t0">First token of the candidate; anything but a number token is rejected.</param>
/// <returns>The item with a hyphen-free Value, or null when the candidate does not qualify.</returns>
public static UriItemToken AttachIcqContent(Pullenti.Ner.Token t0)
{
    bool startsWithNumber = t0 is Pullenti.Ner.NumberToken;
    if (!startsWithNumber)
    {
        return null;
    }

    UriItemToken candidate = AttachISBN(t0);
    if (candidate == null)
    {
        return null;
    }

    // Strip hyphens only when there is at least one, so Value is left untouched otherwise.
    if (candidate.Value.IndexOf('-') >= 0)
    {
        candidate.Value = candidate.Value.Replace("-", "");
    }

    string digits = candidate.Value;
    for (int pos = 0; pos < digits.Length; pos++)
    {
        if (!char.IsDigit(digits[pos]))
        {
            return null;
        }
    }

    int count = digits.Length;
    return (count >= 6 && count <= 10) ? candidate : null;
}
/// <summary>
/// Reads URI content starting at <paramref name="t0"/> using the full set of
/// allowed URI punctuation, then tidies the tail of the result: one trailing
/// '.', ';', '-' or ':' token is dropped, a trailing '/' and then a trailing '\'
/// are removed from the value, and backslashes are turned into forward slashes.
/// </summary>
/// <param name="t0">Token at which the content starts.</param>
/// <param name="afterHttp">Forwarded to _AttachUriContent as its canBeWhitespaces argument.</param>
/// <returns>The cleaned-up item, or null when _AttachUriContent found nothing.</returns>
public static UriItemToken AttachUriContent(Pullenti.Ner.Token t0, bool afterHttp)
{
    UriItemToken item = _AttachUriContent(t0, ".;:-_=+&%#@/\\?[]()!~", afterHttp);
    if (item == null)
    {
        return null;
    }

    // A final punctuation token is treated as sentence punctuation, not as part of the URI.
    if (item.EndToken.IsCharOf(".;-:") && item.EndChar > 3)
    {
        item.EndToken = item.EndToken.Previous;
        item.Value = item.Value.Remove(item.Value.Length - 1);
    }

    // Remove one trailing forward slash and then one trailing backslash (in that order).
    foreach (string separator in new string[] { "/", "\\" })
    {
        if (item.Value.EndsWith(separator))
        {
            item.Value = item.Value.Remove(item.Value.Length - 1);
        }
    }

    // Normalisation is applied only when the first backslash is not at position 0.
    if (item.Value.IndexOf('\\') > 0)
    {
        item.Value = item.Value.Replace('\\', '/');
    }

    return item;
}
/// <summary>
/// Reads an identifier (per the method name, a Skype login) starting at
/// <paramref name="t0"/>. Only '.' and '_' are allowed as punctuation, the start
/// token must not be Cyrillic, and the value must be at least 5 characters long.
/// </summary>
/// <param name="t0">Token at which the identifier starts.</param>
/// <returns>The attached item, or null when the text does not qualify.</returns>
public static UriItemToken AttachSkype(Pullenti.Ner.Token t0)
{
    if (t0.Chars.IsCyrillicLetter)
    {
        return null;
    }

    UriItemToken login = _AttachUriContent(t0, "._", false);
    bool longEnough = login != null && login.Value.Length >= 5;
    return longEnough ? login : null;
}
/// <summary>
/// Assembles a URL that starts with a domain name at <paramref name="t0"/>.
/// After the domain the method optionally reads ":port" (or, for "vk.com", a
/// hyphen-separated part that is appended as a path segment), then any number of
/// "/segment" parts, then an optional "?query" and an optional "#fragment".
/// The result is rejected when the assembled text contains no letter at all.
/// </summary>
/// <param name="t0">Token at which the domain name is expected.</param>
/// <returns>An item spanning from <paramref name="t0"/> to the last consumed token, or null.</returns>
public static UriItemToken AttachUrl(Pullenti.Ner.Token t0)
{
    UriItemToken host = AttachDomainName(t0, true, false);
    if (host == null)
    {
        return null;
    }

    StringBuilder url = new StringBuilder(host.Value);
    Pullenti.Ner.Token last = host.EndToken;
    Pullenti.Ner.Token afterHost = last.Next;

    if (afterHost != null && afterHost.IsChar(':') && (afterHost.Next is Pullenti.Ner.NumberToken))
    {
        // host:port
        last = afterHost.Next;
        url.AppendFormat(":{0}", (last as Pullenti.Ner.NumberToken).Value);
    }
    else if ((host.Value == "vk.com" && afterHost != null && afterHost.IsHiphen) && afterHost.Next != null)
    {
        // "vk.com-xxx": the part after the hyphen is appended as a path segment.
        // NOTE(review): the end token moves past the hyphen even when nothing is
        // attached below - confirm whether that is intended.
        last = afterHost.Next;
        UriItemToken profile = _AttachUriContent(last, ".-_+%", false);
        if (profile != null)
        {
            last = profile.EndToken;
            url.Append('/').Append(profile.Value);
        }
    }

    // Path: a run of "/segment" parts with no whitespace in front of the slash.
    Pullenti.Ner.Token cursor = last.Next;
    while (cursor != null && !cursor.IsWhitespaceBefore && cursor.IsChar('/'))
    {
        UriItemToken segment = cursor.IsWhitespaceAfter
            ? null
            : _AttachUriContent(cursor.Next, ".-_+%", false);
        if (segment == null)
        {
            // A dangling slash still belongs to the URL span, but adds no text.
            last = cursor;
            break;
        }

        last = segment.EndToken;
        url.Append('/').Append(segment.Value);
        cursor = last.Next;
    }

    // Query string: "?..." glued to the preceding token.
    if ((last.Next != null && last.Next.IsChar('?') && !last.Next.IsWhitespaceAfter) && !last.IsWhitespaceAfter)
    {
        UriItemToken query = _AttachUriContent(last.Next.Next, ".-_+%=&", false);
        if (query != null)
        {
            last = query.EndToken;
            url.Append('?').Append(query.Value);
        }
    }

    // Fragment: "#..." glued to the preceding token.
    if ((last.Next != null && last.Next.IsChar('#') && !last.Next.IsWhitespaceAfter) && !last.IsWhitespaceAfter)
    {
        UriItemToken fragment = _AttachUriContent(last.Next.Next, ".-_+%", false);
        if (fragment != null)
        {
            last = fragment.EndToken;
            url.Append('#').Append(fragment.Value);
        }
    }

    // The assembled text must contain at least one letter.
    bool hasLetter = false;
    for (int pos = 0; pos < url.Length; pos++)
    {
        if (char.IsLetter(url[pos]))
        {
            hasLetter = true;
            break;
        }
    }
    if (!hasLetter)
    {
        return null;
    }

    return new UriItemToken(t0, last) { Value = url.ToString() };
}
/// <summary>
/// Collects URI text token by token, starting at <paramref name="t0"/>.
/// </summary>
/// <remarks>
/// Outline of what the body does:
/// 1. AttachDomainName(t0, true, canBeWhitespaces) is tried first. A domain whose
///    value is shorter than 3 characters makes the method return null; otherwise
///    scanning continues with the token after the domain.
/// 2. A token preceded by whitespace ends the scan unless whitespace is allowed,
///    there is no line break, a domain was found, and one of these holds: the
///    previous token is a hyphen; the previous token is '.' and the current one is
///    a 2-character letter token; or a look-ahead over the following whitespace-free
///    run finds a slash or one of the terms HTM/HTML/SHTML/ASP/ASPX/JSP before it
///    meets a non-Latin letter or a character outside <paramref name="chars"/>.
///    A preceding ',' or ';' always ends the scan.
/// 3. Number tokens are always appended (source text). Text tokens are appended
///    when they start with a letter or with a character from <paramref name="chars"/>;
///    a closing ')' or ']' is accepted only if the matching opening character was
///    the last bracket recorded. Other tokens end the scan, except a referent token
///    starting with "РФ" that follows a '.', or a single-token Latin referent token.
/// 4. If nothing was collected, or the collected text has no letter or digit, the
///    domain item (possibly null) is returned as is.
/// 5. One trailing '.' or '/' is dropped and the end token is moved back by one;
///    the domain value is then put in front of the text.
/// 6. If the text starts with two backslashes, every double backslash in it is
///    replaced by "//". A text equal to "WWW" (case-insensitive, after removing a
///    leading "//") makes the method return null.
/// </remarks>
/// <param name="t0">Token at which scanning starts.</param>
/// <param name="chars">Non-letter characters that are allowed inside the content.</param>
/// <param name="canBeWhitespaces">Whether whitespace may, under the conditions above, occur inside the content.</param>
/// <returns>An item from <paramref name="t0"/> to the last accepted token, the bare domain item, or null.</returns>
static UriItemToken _AttachUriContent(Pullenti.Ner.Token t0, string chars, bool canBeWhitespaces = false) { StringBuilder txt = new StringBuilder(); Pullenti.Ner.Token t1 = t0; UriItemToken dom = AttachDomainName(t0, true, canBeWhitespaces); if (dom != null) { if (dom.Value.Length < 3) { return(null); } } char openChar = (char)0; Pullenti.Ner.Token t = t0; if (dom != null) { t = dom.EndToken.Next; } for (; t != null; t = t.Next) { if (t != t0 && t.IsWhitespaceBefore) { if (t.IsNewlineBefore || !canBeWhitespaces) { break; } if (dom == null) { break; } if (t.Previous.IsHiphen) { } else if (t.Previous.IsCharOf(",;")) { break; } else if (t.Previous.IsChar('.') && t.Chars.IsLetter && t.LengthChar == 2) { } else { bool ok = false; Pullenti.Ner.Token tt1 = t; if (t.IsCharOf("\\/")) { tt1 = t.Next; } Pullenti.Ner.Token tt0 = tt1; for (; tt1 != null; tt1 = tt1.Next) { if (tt1 != tt0 && tt1.IsWhitespaceBefore) { break; } if (tt1 is Pullenti.Ner.NumberToken) { continue; } if (!(tt1 is Pullenti.Ner.TextToken)) { break; } string term1 = (tt1 as Pullenti.Ner.TextToken).Term; if (((term1 == "HTM" || term1 == "HTML" || term1 == "SHTML") || term1 == "ASP" || term1 == "ASPX") || term1 == "JSP") { ok = true; break; } if (!tt1.Chars.IsLetter) { if (tt1.IsCharOf("\\/")) { ok = true; break; } if (!tt1.IsCharOf(chars)) { break; } } else if (!tt1.Chars.IsLatinLetter) { break; } } if (!ok) { break; } } } if (t is Pullenti.Ner.NumberToken) { Pullenti.Ner.NumberToken nt = t as Pullenti.Ner.NumberToken; txt.Append(nt.GetSourceText()); t1 = t; continue; } Pullenti.Ner.TextToken tt = t as Pullenti.Ner.TextToken; if (tt == null) { Pullenti.Ner.ReferentToken rt = t as Pullenti.Ner.ReferentToken; if (rt != null && rt.BeginToken.IsValue("РФ", null)) { if (txt.Length > 0 && txt[txt.Length - 1] == '.') { txt.Append(rt.BeginToken.GetSourceText()); t1 = t; continue; } } if (rt != null && rt.Chars.IsLatinLetter && rt.BeginToken == rt.EndToken) { txt.Append(rt.BeginToken.GetSourceText()); t1 = t; 
continue; } break; } string src = tt.GetSourceText(); char ch = src[0]; if (!char.IsLetter(ch)) { if (chars.IndexOf(ch) < 0) { break; } if (ch == '(' || ch == '[') { openChar = ch; } else if (ch == ')') { if (openChar != '(') { break; } openChar = (char)0; } else if (ch == ']') { if (openChar != '[') { break; } openChar = (char)0; } } txt.Append(src); t1 = t; } if (txt.Length == 0) { return(dom); } int i; for (i = 0; i < txt.Length; i++) { if (char.IsLetterOrDigit(txt[i])) { break; } } if (i >= txt.Length) { return(dom); } if (txt[txt.Length - 1] == '.' || txt[txt.Length - 1] == '/') { txt.Length--; t1 = t1.Previous; } if (dom != null) { txt.Insert(0, dom.Value); } string tmp = txt.ToString(); if (tmp.StartsWith("\\\\")) { txt.Replace("\\\\", "//"); tmp = txt.ToString(); } if (tmp.StartsWith("//")) { tmp = tmp.Substring(2); } if (string.Compare(tmp, "WWW", true) == 0) { return(null); } UriItemToken res = new UriItemToken(t0, t1) { Value = txt.ToString() }; return(res); }