/// <summary>
/// Upper-cases the first character of the text and every character that directly
/// follows a space; all remaining characters are lower-cased.
/// </summary>
/// <param name="text">Text to convert. Must not be null.</param>
/// <returns>The converted text, one output character per input character.</returns>
public static string ToTitleCase(this string text)
{
    // Both whole-string conversions are computed once and then indexed per position.
    var lowered = text.ToLowerFast();
    var uppered = text.ToUpper();
    var builder = ReseekableStringBuilder.AcquirePooledStringBuilder();

    var atWordStart = true;
    for (int pos = 0; pos < text.Length; pos++)
    {
        var next = atWordStart ? uppered[pos] : lowered[pos];
        builder.Append(next);

        // Only a literal space in the original text starts a new word.
        atWordStart = text[pos] == ' ';
    }

    return ReseekableStringBuilder.GetValueAndRelease(builder);
}
/// <summary>
/// Converts a PascalCase identifier to space-separated words: every upper-case
/// character except the first character of the text is replaced by a space followed
/// by its lower-case form, and '.' characters are dropped.
/// </summary>
/// <param name="text">Identifier to convert. Must not be null.</param>
/// <returns>The converted text.</returns>
public static string PascalCaseToNormalCase(this string text)
{
#if SALTARELLE
    var builder = new StringBuilder();
#else
    var builder = ReseekableStringBuilder.AcquirePooledStringBuilder();
#endif

    for (int pos = 0; pos < text.Length; pos++)
    {
        var current = text[pos];

        if (pos != 0 && char.IsUpper(current))
        {
            // A new word starts here: emit the separator and the lower-cased letter.
            builder.Append(' ');
#if SALTARELLE
            builder.Append(current.ToString().ToLower());
#else
            builder.Append(char.ToLower(current));
#endif
            continue;
        }

        // Dots are ignored; everything else is copied through unchanged.
        if (current != '.')
        {
            builder.Append(current);
        }
    }

#if SALTARELLE
    return builder.ToString();
#else
    return ReseekableStringBuilder.GetValueAndRelease(builder);
#endif
}
/// <summary>
/// Builds a cookie header value from <c>Cookies</c> (emitted verbatim when non-empty)
/// followed by each entry of <c>CookiesList</c> as <c>name=value</c>, with "; "
/// between consecutive parts.
/// </summary>
/// <returns>The header value; an empty string when there is nothing to emit.</returns>
internal string GetCookieHeader()
{
    var header = ReseekableStringBuilder.AcquirePooledStringBuilder();

    // A separator is needed as soon as anything has been written.
    var needsSeparator = false;
    if (!string.IsNullOrEmpty(Cookies))
    {
        header.Append(Cookies);
        needsSeparator = true;
    }

    if (CookiesList != null)
    {
        foreach (var cookie in CookiesList)
        {
            if (needsSeparator)
            {
                header.Append("; ");
            }

            header.Append(cookie.Name);
            header.Append('=');
            header.Append(HttpExtensionMethods.ToString(cookie.Value));
            needsSeparator = true;
        }
    }

    return ReseekableStringBuilder.GetValueAndRelease(header);
}
/// <summary>
/// Formats a byte array as hexadecimal text, two digits per byte (high nibble first),
/// using the <c>Hex</c> lookup for the digit characters.
/// </summary>
/// <param name="bytes">Bytes to format. Must not be null.</param>
/// <returns>The concatenated digits, with no separators.</returns>
private static string ToHex(byte[] bytes)
{
    var digits = ReseekableStringBuilder.AcquirePooledStringBuilder();

    foreach (var value in bytes)
    {
        digits.Append(Hex[value >> 4]);
        digits.Append(Hex[value & 0xF]);
    }

    return ReseekableStringBuilder.GetValueAndRelease(digits);
}
/// <summary>
/// Pivots a sequence of items into a tab-separated text matrix. The first line holds
/// the column headers (each preceded by a tab); every following line holds a row
/// label followed by one tab-prefixed cell per column. Rows and columns are sorted
/// with the default comparer; missing cells are left empty.
/// </summary>
/// <param name="items">Items to pivot.</param>
/// <param name="getRow">Selects the row key of an item.</param>
/// <param name="getColumn">Selects the column key of an item.</param>
/// <param name="getCell">Selects the cell value of an item.</param>
/// <returns>The matrix as text, one line per row plus the header line.</returns>
/// <exception cref="ArgumentException">
/// Two items map to the same (row, column) pair.
/// </exception>
public static string ToMatrix<T, TRow, TColumn, TCell>(this IEnumerable<T> items, Func<T, TRow> getRow, Func<T, TColumn> getColumn, Func<T, TCell> getCell)
{
    var matrix = new Dictionary<TRow, Dictionary<TColumn, TCell>>();
    var allColumns = new HashSet<TColumn>();

    foreach (var item in items)
    {
        var row = getRow(item);
        var col = getColumn(item);
        var value = getCell(item);
        allColumns.Add(col);

        // Single lookup instead of ContainsKey + indexer set + indexer get.
        Dictionary<TColumn, TCell> cells;
        if (!matrix.TryGetValue(row, out cells))
        {
            cells = new Dictionary<TColumn, TCell>();
            matrix[row] = cells;
        }

        // Add (not the indexer) on purpose: a duplicate (row, column) pair throws.
        cells.Add(col, value);
    }

    var theColumns = allColumns.OrderBy(x => x).ToList();
    // Keep each row's data next to its key so the output loop needs no further lookup.
    var theRows = matrix.OrderBy(x => x.Key).ToList();

    var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();

    // Header line: the leading tab leaves the row-label column empty.
    foreach (var col in theColumns)
    {
        sb.Append('\t');
        sb.Append(col);
    }
    sb.AppendLine();

    foreach (var entry in theRows)
    {
        sb.Append(entry.Key);
        var rowData = entry.Value;
        foreach (var col in theColumns)
        {
            sb.Append('\t');
            TCell value;
            if (rowData.TryGetValue(col, out value))
            {
                // Cell values go through RemoveForbiddenChars with '\t' so they cannot
                // break the column layout. NOTE(review): headers and row labels are
                // written as-is — confirm they can never contain a tab.
                sb.Append(RemoveForbiddenChars(value, '\t'));
            }
        }
        sb.AppendLine();
    }

    return ReseekableStringBuilder.GetValueAndRelease(sb);
}
/// <summary>
/// Rebuilds the URL string when query and/or fragment parameters have been added since
/// the last call (tracked by <c>nextQueryParameterToAdd</c> and
/// <c>nextFragmentParameterToAdd</c>), appending only the parameters not yet emitted.
/// </summary>
/// <returns>
/// The rebuilt URL string, or <c>null</c> when neither parameter list has pending entries.
/// </returns>
internal string GetUrlStringIfNew()
{
    // Stays null until one of the branches below actually has something to append.
    NakedStringBuilder sb = null;
    string fragmentsToReapply = null;
    LoadInitialQueryParameters();
    LoadInitialFragmentParameters();
    if (queryParameters != null && nextQueryParameterToAdd != queryParameters.Count)
    {
        // Materialize the Uri from its unparsed form if that has not happened yet.
        if (unparsedUrl != null)
        {
            url = unparsedUrl.AsUri();
            unparsedUrl = null;
        }
        // The existing fragment must come after the query string, so it is set aside
        // here and re-appended once the new query parameters have been written.
        fragmentsToReapply = this.url != null ? this.url.Fragment : null;
        // NOTE(review): `url` is dereferenced on the next line even when the check
        // above found this.url == null — confirm url is always set at this point.
        var initial = !string.IsNullOrEmpty(fragmentsToReapply) ? url.GetLeftPart_UriPartial_Query() : url.AbsoluteUri;
        sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
        sb.Append(initial);
        HttpUtils.AppendParameters(queryParameters.Skip(nextQueryParameterToAdd), sb, '?');
        nextQueryParameterToAdd = queryParameters.Count;
    }
    if (fragmentParameters != null && nextFragmentParameterToAdd != fragmentParameters.Count)
    {
        if (unparsedUrl != null)
        {
            url = unparsedUrl.AsUri();
            unparsedUrl = null;
        }
        // fragmentsToReapply is only assigned inside the query branch above, which also
        // creates sb, so sb is non-null whenever this first condition holds.
        if (!string.IsNullOrEmpty(fragmentsToReapply))
        {
            sb.Append(fragmentsToReapply);
        }
        else if (sb == null)
        {
            // Only fragment parameters are pending: start from the full current URL.
            var initial = url.AbsoluteUri;
            sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
            sb.Append(initial);
        }
        HttpUtils.AppendParameters(fragmentParameters.Skip(nextFragmentParameterToAdd), sb, '#');
        nextFragmentParameterToAdd = fragmentParameters.Count;
    }
    else if (fragmentsToReapply != null && sb != null)
    {
        // No new fragment parameters: put the original fragment back after the query.
        sb.Append(fragmentsToReapply);
    }
    return(sb != null?ReseekableStringBuilder.GetValueAndRelease(sb) : null);
}
/// <summary>
/// Serializes the points of the interpolator as a flat, comma-separated list of
/// <c>key,value</c> pairs, formatted with the invariant culture.
/// </summary>
/// <returns>The serialized points; an empty string when there are none.</returns>
/// <exception cref="InvalidCastException">
/// <c>Interpolator</c> is not a <c>MonotoneCubicInterpolator</c>.
/// </exception>
public string Serialize()
{
    var builder = ReseekableStringBuilder.AcquirePooledStringBuilder();
    var interpolator = (MonotoneCubicInterpolator)Interpolator;

    // Write the separator before every pair except the first, so nothing has to be
    // trimmed from the end afterwards.
    var isFirst = true;
    foreach (var point in interpolator.points)
    {
        if (!isFirst)
        {
            builder.Append(',');
        }
        isFirst = false;

        builder.Append(point.Key.ToString(CultureInfo.InvariantCulture));
        builder.Append(',');
        builder.Append(point.Value.ToString(CultureInfo.InvariantCulture));
    }

    return ReseekableStringBuilder.GetValueAndRelease(builder);
}
/// <summary>
/// Reverses the escaping of a field name. Names that do not start with '_' are
/// returned unchanged. Otherwise the leading '_' is dropped and, in the remainder,
/// "__" becomes '_', while "_L", "_J" and "_X" (together with the single character
/// that follows the marker letter) become '[', ']' and '-' respectively.
/// </summary>
/// <param name="name">Possibly escaped field name. Must not be null.</param>
/// <returns>The unescaped name.</returns>
/// <exception cref="FormatException">
/// An underscore is followed by an unknown marker, or the name ends in the middle of
/// an escape sequence.
/// </exception>
private static string UnescapeFieldName(string name)
{
    // An empty name has no leading underscore, so there is nothing to unescape.
    if (name.Length == 0 || name[0] != '_')
    {
        return(name);
    }

    var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
    for (int i = 1; i < name.Length; i++)
    {
        char ch = name[i];
        if (ch == '_')
        {
            // A trailing underscore is a truncated escape: report it as malformed
            // input instead of indexing past the end of the string.
            if (i + 1 >= name.Length)
            {
                throw new FormatException("Field name ends with an incomplete escape sequence: " + name);
            }

            var next = name[i + 1];
            if (next == '_')
            {
                // "__" is two characters long: skip the second underscore.
                ch = '_';
                i += 1;
            }
            else
            {
                switch (next)
                {
                    case 'L': ch = '['; break;
                    case 'J': ch = ']'; break;
                    case 'X': ch = '-'; break;
                    default: throw new FormatException("Unknown escape marker '" + next + "' in field name: " + name);
                }

                // Skip the marker letter and the character after it.
                // NOTE(review): presumably that character is a closing '_' written by
                // the escaping counterpart — it is skipped without being checked.
                i += 2;
            }
        }
        sb.Append(ch);
    }

    return(ReseekableStringBuilder.GetValueAndRelease(sb));
}
/// <summary>
/// Converts text to PascalCase: characters that are neither letters nor digits are
/// removed and the next letter or digit after them (and the very first one) is
/// upper-cased. Other letters and digits keep their original case.
/// </summary>
/// <param name="text">Text to convert. Must not be null.</param>
/// <param name="allowDot">
/// When true, '.' is kept in the output and also starts a new word.
/// </param>
/// <returns>The converted text.</returns>
public static string ToPascalCase(this string text, bool allowDot)
{
    var lowered = text.ToLowerFast();
    var uppered = text.ToUpper();
    var output = ReseekableStringBuilder.AcquirePooledStringBuilder();

    var capitalizeNext = true;
    for (int pos = 0; pos < text.Length; pos++)
    {
        var current = text[pos];

        if (allowDot && current == '.')
        {
            output.Append(lowered[pos]);
            capitalizeNext = true;
            continue;
        }

#if SALTARELLE
        var isWordChar = AwdeeUtils.IsLetterOrDigit(current);
#else
        var isWordChar = char.IsLetterOrDigit(current);
#endif

        if (!isWordChar)
        {
            // Separators are dropped; they only mark the start of the next word.
            capitalizeNext = true;
            continue;
        }

        output.Append(capitalizeNext ? uppered[pos] : current);
        capitalizeNext = false;
    }

    return ReseekableStringBuilder.GetValueAndRelease(output);
}
/// <summary>
/// Configures the crawl for a Facebook page, group or profile: normalizes <c>Page</c>,
/// derives the root URL and (when unset) <c>DestinationSuggestedName</c>, then wires the
/// <c>ShouldScrape</c> and <c>CollectAdditionalLinks</c> handlers for either the desktop
/// site (www.facebook.com) or the mobile site (m.facebook.com), depending on
/// <c>ScrapeMobileVersion</c>. Finally queues the root URL and stores it in <c>Root</c>.
/// </summary>
protected override void Initialize()
{
    // this.Parallelism = Math.Min(this.Parallelism, 6);
    var p = Page;
    // A full URL is reduced to its path ("groups/<component>"), or to the "id" query
    // parameter for /profile.php links, and written back to Page.
    if (p.StartsWith("http:") || p.StartsWith("https:"))
    {
        var uu = p.AsUri();
        p = uu.AbsolutePath.Trim('/');
        if (p.StartsWith("groups/"))
        {
            p = "groups/" + uu.GetPathComponent(1);
        }
        else if (uu.AbsolutePath.StartsWith("/profile.php"))
        {
            // NOTE(review): if the URL has no "id" parameter p may become null here and
            // the StartsWith call below would then fail — confirm GetQueryParameter's contract.
            p = uu.GetQueryParameter("id");
        }
        Page = p;
    }
    // "groups-<name>" is accepted as an alternative spelling of "groups/<name>".
    if (p.StartsWith("groups-"))
    {
        p = "groups/" + p.TrimStart("groups-");
    }
    var root = ("https://m.facebook.com/" + p.TrimStart("/")).AsUri();
    if (this.DestinationSuggestedName == null)
    {
        this.DestinationSuggestedName = "facebook-" + (root.GetQueryParameter("id") ?? root.AbsolutePath.TrimStart("/").TrimEnd("/").Replace("/", "-"));
        if (!ScrapeMobileVersion)
        {
            this.DestinationSuggestedName += "-desktop";
        }
    }
    if (!ScrapeMobileVersion)
    {
        // ---- Desktop site ----
        DatabaseInitialized += () =>
        {
            if (fburl != null)
            {
                if (fburl.Contains("group/"))
                {
                    InitGroup();
                }
            }
        };
        // Priority grows with URL length; URLs containing "/ajax/" get a -1000 offset.
        UrlPriorityDelegate = x => (x.Contains("/ajax/") ? -1000 : 0) + x.Length;
        // root is reassigned before any of the lambdas below run, so they all see the
        // www.facebook.com variant.
        root = ("https://www.facebook.com" + root.PathAndQuery).AsUri();
        this.ShouldScrape = (url, prereq) =>
        {
            // Scripts and stylesheets hosted on fbcdn.net are never downloaded.
            if (url.IsHostedOn("fbcdn.net") && url.AbsolutePath.EndsWith(".js"))
            {
                return(false);
            }
            if (url.IsHostedOn("fbcdn.net") && url.AbsolutePath.EndsWith(".css"))
            {
                return(false);
            }
            //if (url.IsHostedOn("fbcdn.net")) return false;
            if (prereq && DownloadThumbnailsAndCss)
            {
                return(true);
            }
            if (Utils.Equals(url, root))
            {
                return(true);
            }
            // facebook.com URL whose only query parameter is id=<Page>.
            if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
            {
                return(true);
            }
            // URLs whose fragment starts with "#$" are accepted only for these two pagelets.
            if (url.Fragment.StartsWith("#$"))
            {
                if (url.PathContains("GroupEntstreamPagelet"))
                {
                    return(true);
                }
                if (url.PathContains("/ProfileTimelineSectionPagelet"))
                {
                    return(true);
                }
            }
            if (url.PathStartsWith("/pages_reaction_units"))
            {
                return(true);
            }
            return(false);
        };
        this.AddToCrawl(root);
        NonHtmlReceived += (url, easy, body) =>
        {
            if (url.PathStartsWith("/pages_reaction_units"))
            {
                ProcessPageReactionUnitsRequest(url, body);
            }
        };
        // First link collector: reacts only to the root page (or its id=<Page> form),
        // detects the entity kind from the al:android:url property and seeds the crawl.
        CollectAdditionalLinks += (url, page) =>
        {
            if (Stop)
            {
                return(null);
            }
            if (HttpUtils.UrisEqual(url, root) || (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page))
            {
                fburl = page.TryGetValue(":property('al:android:url')");
                // An empty sentinel file in DestinationDirectory records that the property
                // was missing; it is deleted again once the property is found.
                var sentinel = Path.Combine(DestinationDirectory, url.AbsolutePath.StartsWith("/groups/") ? "fbgroup-not-a-member" : "fberror");
                if (fburl == null)
                {
                    File.WriteAllText(sentinel, string.Empty, Encoding.ASCII);
                    return(Enumerable.Empty <(Uri, Boolean)>());
                }
                File.Delete(sentinel);
                // Trailing run of digits in the path, before any query string.
                fbNumericId = fburl.Capture(@"/(\d+)(\?.*)?$");
                if (fburl.Contains("group/"))
                {
                    InitGroup();
                }
                else if (fburl.Contains("page/"))
                {
                    var initialUrl = ReadFromExample("fbpage");
                    //defaultParameters = initialUrl.FragmentParameters.ToDictionary();
                    initialUrl.AppendQueryParameter("page_id", fbNumericId);
                    if (UpdateUpTo != null)
                    {
                        // Adds today's UTC date as "x-refresh" to the request URL.
                        refreshId = DateTime.UtcNow.ToString("yyyyMMdd");
                        initialUrl.AppendQueryParameter("x-refresh", refreshId);
                    }
                    AddToCrawl(initialUrl.Url);
                }
                else if (fburl.Contains("profile/"))
                {
                    AddToCrawl(root);
                    AddToCrawl(root + "/about?section=overview", true);
                    AddToCrawl(root + "/about?section=education", true);
                    AddToCrawl(root + "/about?section=living", true);
                    AddToCrawl(root + "/about?section=contact-info", true);
                    AddToCrawl(root + "/about?section=relationship", true);
                    AddToCrawl(root + "/about?section=bio", true);
                    AddToCrawl(root + "/about?section=year-overviews", true);
                    AddToCrawl(root + "/likes", true);
                    AddToCrawl(root + "/photos", true);
                    AddToCrawl(root + "/friends", true);
                    var pageletToken = page.TryGetValue(@"script:json-token('jscc_map:'):json-token('£pagelet_token')");
                    if (pageletToken == null)
                    {
                        // Fallback selector; GetValue (not TryGetValue) is used here.
                        pageletToken = page.GetValue("script:json-token('pagelet_token:')");
                    }
                    // First timeline segment starts at the current Unix time (seconds).
                    AddFacebookProfileSegment(((long)((DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc)).TotalSeconds)), fbNumericId, pageletToken);
                }
                else
                {
                    throw new NotSupportedException("Not supported: " + fburl);
                }
            }
            return(null);
        };
        // Second link collector: extracts links and article dates from every page or
        // JSON response, updates the progress status, and decides whether to continue.
        CollectAdditionalLinks += (url, page) =>
        {
            if (Stop)
            {
                return(null);
            }
            //Console.WriteLine("Size: " + page.OuterHtml.Length);
            // NOTE(review): html is enumerated more than once below (SelectMany, then
            // Select) — confirm FindAll/Concat re-enumeration is cheap and stable.
            var html = page.OwnerDocument.IsJson() ? page.FindAll(Configuration_PageReactionUnitsSelector) : page.FindAll(Configuration_PageletExtractionSelector).Concat(new[] { RehydrateHtml(page) });
            //: Configuration_PageletExtractionSelector);
            /*
             * var hasSetMainProgress = false;
             *
             * void MaybeSetMainProgress(HtmlNode y)
             * {
             *     if (y != null && !hasSetMainProgress && y.TagName == "abbr")
             *     {
             *         var datestr = y.GetAttributeValue("title") ?? y.GetText();
             *         if (datestr != null)
             *         {
             *             hasSetMainProgress = true;
             *             SetMainProgressStatus(datestr);
             *         }
             *     }
             * }
             */
            // MaybeSetMainProgress(html.Select(x=>x.FindSingle("abbr"));
            // Article timestamps collected as a side effect of the projection below.
            var dates = new List <long>();
            var zzz = html.SelectMany(x =>
            {
                // NOTE(review): fburl is dereferenced here, but the first collector
                // leaves it null when the al:android:url property is missing — confirm
                // this handler cannot run with a non-empty html in that state.
                if (fburl.Contains("profile/"))
                {
                    var articles = x.FindAll("[data-ftr]");
                    foreach (var article in articles)
                    {
                        var ut = GetRootArticleDate(article);
                        if (ut != null)
                        {
                            dates.Add(ut.Value);
                        }
                    }
                }
                var zz = x.DescendantsAndSelf().Select(y =>
                {
                    //MaybeSetMainProgress(y);
                    if (y.GetAttributeValue("data-ftr") != null)
                    {
                        var ut = GetRootArticleDate(y);
                        if (ut != null)
                        {
                            dates.Add(ut.Value);
                        }
                    }
                    /*
                     * var ajaxify = y.GetAttributeValue("ajaxify");
                     * if (ajaxify != null && ajaxify.StartsWith("/pages_reaction_units/"))
                     * {
                     *
                     *     var u = new Uri("https://www.facebook.com" + ajaxify);
                     *     return (Url: u, false);
                     * }
                     */
                    var z = y.TryGetLinkUrl();
                    if (z != null && !z.Contains("pages_reaction_unit"))
                    {
                        // Second tuple element is true for img/script tags and for
                        // rel="stylesheet" links.
                        return(Url: z, y.TagName.In("img", "script") || y.GetAttributeValue("rel")?.ToLowerFast() == "stylesheet");
                    }
                    return(null, false);
                }).Where(y => y.Url != null).ToList();
                return(zz);
            }).ToList();
            // NOTE(review): with two or more dates the first collected one is excluded
            // from the minimum (Skip(1)) — confirm this is intentional.
            var mindate = dates.Count == 0 ? (long?)null : dates.Count == 1 ? dates[0] : dates.Skip(1).Min();
            if (mindate != null)
            {
                // mindate is treated as Unix seconds (UTC) for display.
                SetMainProgressStatus(new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc).AddSeconds(mindate.Value).ToString("MMM d, yyyy H:mm"));
            }
            var mustStop = mindate != null && IsBeforeLimitDate(mindate.Value);
            // NOTE(review): this tests "/profile" while the other checks in this method
            // test "profile/" — confirm both match the same fburl values.
            if (fburl.Contains("/profile"))
            {
                if (!mustStop && mindate != null)
                {
                    var pageletToken = html.Select(x => x.TryGetValue("script:json-token('£jscc_map'):json-token('£pagelet_token')")).FirstOrDefault(x => x != null);
                    if (pageletToken != null)
                    {
                        // Next segment starts one second before the minimum date seen.
                        AddFacebookProfileSegment(mindate.Value - 1, fbNumericId, pageletToken);
                    }
                }
            }
            else
            {
                if (mustStop)
                {
                    Stop = true;
                }
            }
            /*
             * if (UpdateUpTo != null)
             * {
             *     var qq = zzz.Where(x => x.Url.Contains("pages_reaction_unit")).Distinct().ToList();
             *     if (qq.Count >= 2)
             *     {
             *
             *     }
             *     var u = zzz.FindIndex(x=>x.Url.Contains("pages_reaction_unit"));
             *     if (u != -1)
             *     {
             *         var m = new LazyUri(zzz[u].Url);
             *         m.AppendQueryParameter("x-refresh", refreshId);
             *         zzz[u] = (m.Url, zzz[u].Item2);
             *     }
             * }
             */
            return(zzz);
        };
    }
    else
    {
        // ---- Mobile site ----
        // Follows the "load more" style links, rewriting volatile query parameters to
        // fixed values and tagging the result with srw=1 so ShouldScrape can recognize it.
        this.CollectAdditionalLinks += (url, page) =>
        {
            return(page.FindAll("a[href*='sectionLoadingID='],a[href*='bacr']:text-is('See More Posts')").Select(x =>
            {
                var u = x.TryGetLinkUrl();
                var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
                sb.Append(u.GetLeftPart(UriPartial.Path));
                HttpUtils.AppendQueryParameters(u.GetQueryParameters().Select(y =>
                {
                    var val = y.Value;
                    if (y.Key == "timecutoff")
                    {
                        val = "0";
                    }
                    else if (y.Key == "refid")
                    {
                        val = "0";
                    }
                    else if (y.Key == "sectionLoadingID")
                    {
                        val = "x";
                    }
                    else if (y.Key == "yearSectionsYears")
                    {
                        val = "";
                    }
                    return new KeyValuePair <string, string>(y.Key, val);
                }), sb);
                HttpUtils.AppendQueryParameter("srw", "1", sb);
                var rewritten = ReseekableStringBuilder.GetValueAndRelease(sb);
                return (rewritten.AsUri(), false);
            }));
        };
        this.ShouldScrape = (url, isPrerequisite) =>
        {
            if (DownloadThumbnailsAndCss && isPrerequisite)
            {
                return(true);
            }
            if (url == root)
            {
                return(true);
            }
            if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
            {
                return(true);
            }
            if (!url.IsHostedOn("m.facebook.com"))
            {
                return(false);
            }
            // Links rewritten by the collector above carry the srw marker.
            if (url.GetQueryParameter("srw") != null)
            {
                /*
                 * if (url.Query.StartsWith("?sectionLoadingID="))
                 * {
                 *     // Click Show more only for individual years, not the global one.
                 *     return url.GetQueryParameter("timestart") != "0";
                 * }
                 */
                return(true);
            }
            if (DownloadComments || DownloadFullSizeImages)
            {
                if (UrlRuleMatcher.IsSubfolderOf(url, root))
                {
                    if (url.HasNoQueryParameters() && string.IsNullOrEmpty(url.Fragment))
                    {
                        return(true);
                    }
                    // URLs with a query or fragment are rejected, but their path-only
                    // form is queued instead.
                    AddToCrawl(url.GetLeftPart(UriPartial.Path));
                    return(false);
                }
            }
            return(false);
        };
        this.AddToCrawl(root);
    }
    this.Root = root;
}
/// <summary>
/// Replaces HTML entities in the text with the characters they denote. Numeric entities
/// (decimal and hexadecimal) are decoded directly; named entities are looked up in
/// <c>HtmlEntity._entityValue</c>. Sequences that cannot be decoded are copied to the
/// output as they were written.
/// </summary>
/// <param name="text">Text to decode; may be null or empty.</param>
/// <returns>
/// <c>null</c> for null input, the same string instance when no entity start was
/// processed, otherwise the decoded text.
/// </returns>
public static string DeEntitize(string text)
{
    if (text == null)
    {
        return(null);
    }
    if (text.Length == 0)
    {
        return(text);
    }
#if !SALTARELLE
    // NOTE(review): under SALTARELLE these two are not declared here — presumably they
    // exist at a wider scope in that build; verify.
    StringBuilder entity = null;
    StringBuilder sb = null;
#endif
    // flag: becomes true at the first entity start; from then on output goes to sb.
    bool flag = false;
    HtmlEntity.ParseState parseState = HtmlEntity.ParseState.Text;
    for (int i = 0; i < text.Length; i++)
    {
        switch (parseState)
        {
        case HtmlEntity.ParseState.Text:
        {
            char c = text[i];
            if (c == '&')
            {
                parseState = HtmlEntity.ParseState.EntityStart;
            }
            else
            {
                // Before the first entity nothing is copied; the prefix is copied in
                // one go when the first entity start is processed.
                if (flag)
                {
                    sb.Append(text[i]);
                }
            }
            break;
        }

        case HtmlEntity.ParseState.EntityStart:
        {
            if (!flag)
            {
                flag = true;
                if (sb == null)
                {
#if SALTARELLE
                    sb = new StringBuilder(text.Length);
#else
                    sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
#endif
                }
                else
                {
                    sb.Length = 0;
                }
                // i is the index just after the '&', so the '&' sits at i - 1 and the
                // plain-text prefix is the first i - 1 characters.
                sb.Append(text, 0, i - 1);
                if (entity == null)
                {
#if SALTARELLE
                    entity = new StringBuilder(10);
#else
                    entity = ReseekableStringBuilder.AcquirePooledStringBuilder();
#endif
                }
                entity.Length = 0;
            }
            char c2 = text[i];
            // These characters terminate the entity name.
            if (c2 == ';' || c2 == '<' || c2 == '"' || c2 == '\'' || HtmlEntity.IsWhiteSpace(c2))
            {
                // flag2: the entity was properly closed with ';'.
                bool flag2 = c2 == ';';
                if (!flag2)
                {
                    // The terminator is not part of the entity: step back so the Text
                    // state processes it on the next iteration.
                    i--;
                }
                if (entity.Length == 0)
                {
                    // A bare '&' (or "&;") is emitted unchanged.
                    if (c2 == ';')
                    {
                        sb.Append("&;");
                    }
                    else
                    {
                        sb.Append('&');
                    }
                }
                else
                {
                    if (entity[0] == '#')
                    {
                        // Numeric entity: "#123" (decimal) or "#x7B" (hexadecimal).
                        string text2 = entity.ToString();
                        try
                        {
                            string text3 = text2.SubstringCached(1).Trim().ToLowerFast();
                            int fromBase;
                            if (text3.StartsWith("x"))
                            {
                                fromBase = 16;
                                text3 = text3.SubstringCached(1);
                            }
                            else
                            {
                                fromBase = 10;
                            }
#if SALTARELLE
                            int num = int.Parse(text3, fromBase);
#else
                            int num = Convert.ToInt32(text3, fromBase);
#endif
                            // Values above 65535 do not fit in a single char and are
                            // converted through the code-point helper instead.
                            if (num > 65535)
                            {
#if SALTARELLE
                                sb.Append(StringFromCodePoint(num));
#else
                                sb.Append(char.ConvertFromUtf32(num));
#endif
                            }
                            else
                            {
                                sb.Append((char)num);
                            }
                            goto IL_235;
                        }
                        catch
                        {
                            // Any failure while decoding: emit the entity text with a
                            // closing ';' (added even if the input had none).
                            sb.Append("&#" + text2 + ";");
                            goto IL_235;
                        }
                        goto IL_1DC;
                    }
                    goto IL_1DC;
IL_235:
                    // Common exit: reset the entity buffer for the next entity.
                    entity.Remove(0, entity.Length);
                    goto IL_24B;
IL_1DC:
                    // Named entity: look the name up in the entity table.
                    int num2;
#if SALTARELLE
                    num2 = HtmlEntity._entityValue[entity.ToStringCached()];
                    if (!Script.IsNullOrUndefined(num2))
                    {
                        sb.Append((char)num2);
                        goto IL_235;
                    }
#else
                    if (HtmlEntity._entityValue.TryGetValue(entity.ToStringCached(), out num2))
                    {
                        int num3 = num2;
                        sb.Append((char)num3);
                        goto IL_235;
                    }
#endif
                    // Unknown name: copy it through as written, keeping the ';' only
                    // if the input had one.
                    sb.Append('&');
                    sb.Append(entity);
                    if (flag2)
                    {
                        sb.Append(';');
                        goto IL_235;
                    }
                    goto IL_235;
                }
IL_24B:
                parseState = HtmlEntity.ParseState.Text;
            }
            else
            {
                if (c2 == '&')
                {
                    // A new '&' before the previous entity ended: flush what was
                    // collected as literal text and stay in the entity state.
                    sb.Append("&" + entity);
                    entity.Remove(0, entity.Length);
                }
                else
                {
                    entity.Append(text[i]);
                    // Longer than any known entity: give up and treat it as text.
                    if (entity.Length > HtmlEntity._maxEntitySize)
                    {
                        parseState = HtmlEntity.ParseState.Text;
                        sb.Append("&" + entity);
                        entity.Remove(0, entity.Length);
                    }
                }
            }
            break;
        }
        }
    }
    // Input ended inside an entity: emit the unfinished entity as literal text.
    if (parseState == HtmlEntity.ParseState.EntityStart && flag)
    {
        sb.Append("&" + entity);
    }
    if (!flag)
    {
        return(text);
    }
    // NOTE(review): sb and entity are taken from AcquirePooledStringBuilder but this
    // method never calls ReseekableStringBuilder.GetValueAndRelease — confirm whether
    // leaving them un-released is intended.
    return(sb.ToStringCached());
}
/// <summary>
/// Builds the local storage path for a URL: the optional root folder, the URL authority
/// (with ':' replaced by '_'), the path components, the query and fragment, and finally
/// a file extension.
/// </summary>
/// <param name="root">
/// Root folder to prefix. Backslashes are converted to '/' and a trailing '/' is
/// ensured. <c>null</c> or an empty string means "no root".
/// </param>
/// <param name="url">URL to map to a path. Must not be null.</param>
/// <param name="contentType">
/// Optional MIME type. A type containing "/html" forces the ".html" extension; any
/// other type is used to pick an extension when the URL does not supply a known one.
/// </param>
/// <param name="pathComponentsToKeepAtRoot">
/// When not -1, only this many leading path components stay separate folders; the
/// remaining ones are joined with '-' into a single component.
/// </param>
/// <returns>The computed path, using '/' as separator.</returns>
public static string GetPathInternal(string root, Uri url, string contentType = null, int pathComponentsToKeepAtRoot = -1)
{
    var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();

    // An empty root is treated like a missing one; indexing sb[sb.Length - 1] on an
    // empty builder would otherwise throw.
    if (!string.IsNullOrEmpty(root))
    {
        sb.Append(root);
        sb.Replace('\\', '/');
        if (sb[sb.Length - 1] != '/')
        {
            sb.Append('/');
        }
    }

    sb.Append(url.Authority.Replace(":", "_"));
    sb.Append('/');

    var pathComponents = url.AbsolutePath.SplitFast('/', StringSplitOptions.RemoveEmptyEntries);
    if (pathComponentsToKeepAtRoot != -1 && pathComponentsToKeepAtRoot < pathComponents.Length)
    {
        // Flatten everything past the kept prefix into one '-'-joined component.
        var last = string.Join("-", pathComponents.Skip(pathComponentsToKeepAtRoot));
        pathComponents = pathComponents.Take(pathComponentsToKeepAtRoot).Concat(new[] { last }).ToArray();
    }

    if (pathComponents.Length == 0 && string.IsNullOrEmpty(url.Query))
    {
        // Bare host with no query: use a conventional file name.
        sb.Append("index");
    }
    else
    {
        for (int i = 0; i < pathComponents.Length; i++)
        {
            if (i != 0)
            {
                sb.Append('/');
            }

            var v = Uri.UnescapeDataString(pathComponents[i].Trim().Replace('+', ' '));
            if (v == "..")
            {
                // Never emit a parent-directory component; substitute a look-alike.
                v = "،،";
            }
            AppendEscaped(sb, v);
        }

        if (!string.IsNullOrEmpty(url.Query))
        {
            AppendEscaped(sb, url.Query);
        }
        if (!string.IsNullOrEmpty(url.Fragment))
        {
            AppendEscaped(sb, url.Fragment);
        }
    }

    // Extension priority: a known extension from the URL, unless the content type says
    // HTML; otherwise one derived from the content type; otherwise ".html".
    string extension = null;
    var ext = GetExtension(url);
    if (ext != null && Configuration_KnownExtensions.Contains(ext))
    {
        extension = ext;
    }

    if (contentType != null)
    {
        if (contentType.Contains("/html"))
        {
            extension = null;
        }
        else if (extension == null)
        {
            extension = MimeToExtension(contentType);
        }
    }

    if (extension == null)
    {
        extension = ".html";
    }

    // Do not duplicate an extension that is already there (".htm" counts as ".html").
    var alreadyHasExtension = EndsWith(sb, extension) || (extension == ".html" && EndsWith(sb, ".htm"));
    if (!alreadyHasExtension)
    {
        sb.Append(extension);
    }

    return(ReseekableStringBuilder.GetValueAndRelease(sb));
}
/// <summary>
/// Returns the text lower-cased and with accent marks removed. Text containing only
/// characters below U+00C0 takes a fast path: it is returned unchanged when it has no
/// 'A'-'Z' characters, and merely lower-cased otherwise.
/// </summary>
/// <param name="text">Text to normalize. Must not be null.</param>
/// <returns>The normalized text.</returns>
public static string RemoveAccentMarksAndToLower(string text)
{
    // Scan until the first character at or above U+00C0; characters before it are
    // only checked for ASCII upper-case letters.
    var sawAsciiUpper = false;
    var sawHighChar = false;
    for (int pos = 0; pos < text.Length && !sawHighChar; pos++)
    {
        var current = text[pos];
        if (current >= '\xC0' /* À */)
        {
            sawHighChar = true;
        }
        else if (current >= 'A' && current <= 'Z')
        {
            sawAsciiUpper = true;
        }
    }

    if (!sawHighChar)
    {
        return sawAsciiUpper ? text.ToLowerFast() : text;
    }

#if SALTARELLE || NETFX_CORE
    // No string normalization here: map accented characters through the
    // Accents/Removed lookup instead.
    var output = new StringBuilder();
    for (int pos = 0; pos < text.Length; pos++)
    {
        var current = text[pos];
#if SALTARELLE
        current = current.ToString().ToLower()[0];
#else
        if (CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }
        current = char.ToLowerInvariant(current);
#endif
        var accentIndex = Accents.IndexOf(current);
        output.Append(accentIndex != -1 ? Removed[accentIndex] : current);
    }
    return output.ToString();
#else
    //var tempBytes = Encoding.GetEncoding("latin2").GetBytes(text);
    //return Encoding.UTF8.GetString(tempBytes, 0, tempBytes.Length).ToLower();
    //#else

    // Decompose (form D) so accents become separate non-spacing marks, then drop them.
#if NET35
    var decomposed = text.Normalize(NormalizationForm.FormD);
#else
    var decomposed = NormalizeStringFunction != null ? NormalizeStringFunction(text, NormalizationForm.FormD) : text.Normalize(NormalizationForm.FormD);
#endif
    StringBuilder stripped = ReseekableStringBuilder.AcquirePooledStringBuilder();
    foreach (char candidate in decomposed)
    {
        if (CharUnicodeInfo.GetUnicodeCategory(candidate) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }
        stripped.Append(candidate);
    }
    return ReseekableStringBuilder.GetValueAndRelease(stripped).ToLowerFast();
#endif
}