/// <summary>
/// Extracts the text of the embedded PDF resource
/// "annual_time_series_on_world_official_gold_reserves.pdf" and converts its
/// tables into one raw report per year column.
/// </summary>
/// <remarks>
/// Lines are processed in page order and, within a page, by descending Y
/// coordinate. A line matching <c>s_regexTableInfo</c> starts a new group of
/// reports; the line that follows it is read as the header holding one year per
/// column. Parsing stops at the first table whose number is 3 or greater.
/// </remarks>
/// <returns>
/// The reports of every parsed table section, in the order the sections were
/// encountered. The list is empty when no table header is found.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The embedded PDF resource cannot be found, or a header line contains a year
/// of 2000 or later.
/// </exception>
public List<WorldOfficialGoldHoldingReport_Raw> Run()
{
    List<string> textLines;
    var t = typeof(WorldOfficialGoldHoldingReportsScraper_UpTo2000);
    using (var stream = t.Assembly.GetManifestResourceStream(t, "annual_time_series_on_world_official_gold_reserves.pdf"))
    {
        // GetManifestResourceStream returns null (it does not throw) when the
        // resource is missing; fail with a clear message instead of handing
        // null to PdfReader.
        if (stream == null)
        {
            throw new InvalidOperationException(
                "Embedded resource 'annual_time_series_on_world_official_gold_reserves.pdf' was not found.");
        }

        using (var pdfReader = new PdfReader(stream))
        {
            var textExtractor = new TextExtractor();

            // Page numbers passed to GetTextFromPage are 1-based here.
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractor);
                textExtractor.NextPage();
            }

            // Order by page ascending, then by Y descending.
            textExtractor.Lines.Sort((l1, l2) =>
            {
                int c = l1.PageNumber.CompareTo(l2.PageNumber);
                if (c != 0)
                {
                    return c;
                }

                return l2.Y.CompareTo(l1.Y);
            });

            textLines = textExtractor.Lines.Select(l => l.Value).ToList();
        }
    }

    // Reports of table sections that are already complete.
    var finishedReports = new List<WorldOfficialGoldHoldingReport_Raw>();

    // Reports (one per year column) of the section currently being filled;
    // null until the first table header has been seen.
    List<WorldOfficialGoldHoldingReport_Raw> currentReports = null;

    int currentTableNo = 0;

    for (int i = 0; i < textLines.Count; i++)
    {
        var headerMatch = s_regexTableInfo.Match(textLines[i]);
        if (headerMatch.Success)
        {
            int tableNo = int.Parse(
                headerMatch.Groups[1].Captures.Cast<Capture>().Single().Value,
                NumberStyles.None,
                NumberFormatInfo.InvariantInfo);

            if (tableNo != currentTableNo)
            {
                // Only tables numbered below 3 are processed.
                if (3 <= tableNo)
                {
                    break;
                }

                currentTableNo = tableNo;
            }

            // The line after the table header carries the column years;
            // nothing more to do if the header was the last line.
            if (++i == textLines.Count)
            {
                break;
            }

            var yearPerColumn = textLines[i]
                .Split(new char[] { ' ', }, StringSplitOptions.RemoveEmptyEntries)
                .Select(s =>
                {
                    // When s_regexNote matches, only the text before the match
                    // is parsed as the year.
                    var noteMatch = s_regexNote.Match(s);
                    return int.Parse(
                        noteMatch.Success ? s.Substring(0, noteMatch.Index) : s,
                        NumberStyles.AllowLeadingWhite,
                        NumberFormatInfo.InvariantInfo);
                })
                .ToList();

            if (yearPerColumn.Any(year => 2000 <= year))
            {
                throw new InvalidOperationException(
                    "Unexpected year (2000 or later) in table header line: " + textLines[i]);
            }

            if (currentReports != null)
            {
                finishedReports.AddRange(currentReports);
            }

            currentReports = yearPerColumn
                .Select(year => new WorldOfficialGoldHoldingReport_Raw()
                {
                    DataTimePoint = new DateTime(year, 1, 1, 0, 0, 0, DateTimeKind.Unspecified),
                    PublishTimePoint = new DateTime(2011, 8, 10, 0, 0, 0, DateTimeKind.Unspecified),
                    Rows = new List<WorldOfficialGoldHoldingReportRow_Raw>(),
                })
                .ToList();
            continue;
        }

        // Lines that appear before the first table header cannot belong to any
        // report; skip them instead of dereferencing a null list.
        if (currentReports == null)
        {
            continue;
        }

        // NOTE(review): TryGetCells presumably returns null for lines that are
        // not data rows - confirm against its definition. Such lines are ignored.
        var reportRowPerCell = TryGetCells(textLines[i], currentReports.Count);
        if (reportRowPerCell == null)
        {
            continue;
        }

        for (int j = 0; j < currentReports.Count; j++)
        {
            currentReports[j].Rows.Add(reportRowPerCell[j]);
        }
    }

    // Flush the section that was still open when parsing ended.
    if (currentReports != null)
    {
        finishedReports.AddRange(currentReports);
    }

    return finishedReports;
}