Ejemplo n.º 1
0
        /// <summary>
        /// Extracts the text lines of the embedded PDF resource
        /// "annual_time_series_on_world_official_gold_reserves.pdf" and parses the
        /// tables numbered below 3 into one raw report per year column.
        /// </summary>
        /// <returns>
        /// One <see cref="WorldOfficialGoldHoldingReport_Raw"/> per year column of every
        /// table header encountered, in the order the headers appear in the document.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The embedded PDF resource cannot be found, or a table header contains a
        /// year column of 2000 or later.
        /// </exception>
        public List <WorldOfficialGoldHoldingReport_Raw> Run()
        {
            const string resourceName = "annual_time_series_on_world_official_gold_reserves.pdf";

            List <string> textLines;
            var           t = typeof(WorldOfficialGoldHoldingReportsScraper_UpTo2000);

            using (var stream = t.Assembly.GetManifestResourceStream(t, resourceName))
            {
                // GetManifestResourceStream returns null (it does not throw) when the
                // resource is missing; fail with a clear message instead of handing
                // null to PdfReader.
                if (stream == null)
                {
                    throw new InvalidOperationException("Embedded resource '" + resourceName + "' was not found.");
                }
                using (var pdfReader = new PdfReader(stream))
                {
                    var textExtractor = new TextExtractor();
                    for (int pageNo = 1; pageNo <= pdfReader.NumberOfPages; pageNo++)
                    {
                        PdfTextExtractor.GetTextFromPage(pdfReader, pageNo, textExtractor);
                        textExtractor.NextPage();
                    }
                    // Order lines by page, then by descending Y within a page
                    // (presumably top-to-bottom reading order -- confirm against TextExtractor).
                    textExtractor.Lines.Sort((l1, l2) =>
                    {
                        int c = l1.PageNumber.CompareTo(l2.PageNumber);
                        if (c != 0)
                        {
                            return(c);
                        }
                        return(l2.Y.CompareTo(l1.Y));
                    });
                    textLines = textExtractor.Lines.Select(l => l.Value).ToList();
                }
            }

            // Reports of table sections that are already complete.
            var finishedReports = new List <WorldOfficialGoldHoldingReport_Raw>();
            // Reports (one per year column) of the table section currently being filled;
            // null until the first table header has been seen.
            List <WorldOfficialGoldHoldingReport_Raw> currentReports = null;
            int currentTableNo = 0;

            for (int i = 0; i < textLines.Count; i++)
            {
                var headerMatch = s_regexTableInfo.Match(textLines[i]);
                if (!headerMatch.Success)
                {
                    // Lines that appear before any table header cannot belong to a
                    // table; skip them instead of dereferencing a null report list.
                    if (currentReports == null)
                    {
                        continue;
                    }
                    var reportRowPerCell = TryGetCells(textLines[i], currentReports.Count);
                    if (reportRowPerCell != null)
                    {
                        for (int j = 0; j < currentReports.Count; j++)
                        {
                            currentReports[j].Rows.Add(reportRowPerCell[j]);
                        }
                    }
                    // Lines that TryGetCells rejects are not table rows and are ignored.
                    continue;
                }

                int tableNo = int.Parse(headerMatch.Groups[1].Captures.Cast <Capture>().Single().Value, NumberStyles.None, NumberFormatInfo.InvariantInfo);
                if (currentTableNo != tableNo)
                {
                    // Only tables numbered below 3 are parsed; stop at the first later table.
                    if (3 <= tableNo)
                    {
                        break;
                    }
                    currentTableNo = tableNo;
                }

                // The line following a table header holds the year of each column.
                if (++i == textLines.Count)
                {
                    break;
                }
                var yearPerColumn = textLines[i].Split(new char[] { ' ', }, StringSplitOptions.RemoveEmptyEntries)
                                    .Select(s =>
                {
                    // Drop whatever s_regexNote matches (and anything after it) before parsing the year.
                    var noteMatch = s_regexNote.Match(s);
                    return(int.Parse(noteMatch.Success
                            ? s.Substring(0, noteMatch.Index)
                            : s, NumberStyles.AllowLeadingWhite, NumberFormatInfo.InvariantInfo));
                }).ToList();
                if (yearPerColumn.Any(year => 2000 <= year))
                {
                    throw new InvalidOperationException("Unexpected year column (2000 or later) in table header line: '" + textLines[i] + "'.");
                }

                // A new header closes the previous table section.
                if (currentReports != null)
                {
                    finishedReports.AddRange(currentReports);
                }
                currentReports = yearPerColumn.Select(year => new WorldOfficialGoldHoldingReport_Raw()
                {
                    DataTimePoint    = new DateTime(year, 1, 1, 0, 0, 0, DateTimeKind.Unspecified),
                    PublishTimePoint = new DateTime(2011, 8, 10, 0, 0, 0, DateTimeKind.Unspecified),
                    Rows             = new List <WorldOfficialGoldHoldingReportRow_Raw>(),
                }).ToList();
            }

            // Flush the last table section that was still being filled.
            if (currentReports != null)
            {
                finishedReports.AddRange(currentReports);
            }
            return(finishedReports);
        }