Exemple #1
0
        public void SECDataConvert_ShouldParseSingleOrMultipleFilingValuesFields()
        {
            // Load both raw fixtures up front so a missing test file fails before any parsing is attempted
            var singleFieldXml   = File.ReadAllText(Path.Combine("TestData", "sec_report_raw_single.xml"));
            var multipleFieldXml = File.ReadAllText(Path.Combine("TestData", "sec_report_raw_multiple.xml"));
            var reportFactory    = new SECReportFactory();

            // Neither fixture may cause the factory to throw
            Assert.DoesNotThrow(() => reportFactory.CreateSECReport(singleFieldXml));
            Assert.DoesNotThrow(() => reportFactory.CreateSECReport(multipleFieldXml));
        }
Exemple #2
0
        public void SECReportFactory_CreatesProperReportType()
        {
            var reportFactory = new SECReportFactory();

            // The `as` cast yields null when the factory returns a different concrete type,
            // so each NotNull assertion below doubles as a type check.
            var form8k  = reportFactory.CreateSECReport(_xmlForm8k) as SECReport8K;
            var form10k = reportFactory.CreateSECReport(_xmlForm10k) as SECReport10K;
            var form10q = reportFactory.CreateSECReport(_xmlForm10q) as SECReport10Q;

            Assert.NotNull(form8k);
            Assert.NotNull(form10k);
            Assert.NotNull(form10q);
        }
Exemple #3
0
        /// <summary>
        /// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
        /// </summary>
        /// <remarks>
        /// Every <c>*.nc.tar.gz</c> archive found under <c>RawSource</c> is handled in parallel.
        /// The leading segment of the archive's file name must be a <c>yyyyMMdd</c> date; archives
        /// whose date falls outside [<paramref name="startDate"/>, <paramref name="endDate"/>] are skipped.
        /// Each accepted archive is extracted into a directory named after its date, its <c>*.nc</c>
        /// files are parsed into reports, the reports filed on the archive's date are written out,
        /// and the extraction directory is deleted afterwards.
        /// </remarks>
        /// <param name="startDate">Starting date to start process files (inclusive)</param>
        /// <param name="endDate">Ending date to stop processing files (inclusive)</param>
        /// <exception cref="Exception">
        /// Thrown from within the parallel loop when an archive's file name does not start with a
        /// <c>yyyyMMdd</c> date.
        /// </exception>
        public void Process(DateTime startDate, DateTime endDate)
        {
            Parallel.ForEach(
                Directory.GetFiles(RawSource, "*.nc.tar.gz", SearchOption.AllDirectories).ToList(),
                rawFile =>
                {
                    // Path.GetFileNameWithoutExtension would only strip the trailing ".gz", so take
                    // everything before the first '.' to isolate the date portion of the file name.
                    var fileDate        = Path.GetFileName(rawFile).Split('.')[0];
                    var extractDataPath = Path.Combine(RawSource, fileDate);

                    DateTime currentDate;
                    if (!DateTime.TryParseExact(fileDate, "yyyyMMdd", CultureInfo.InvariantCulture, DateTimeStyles.None, out currentDate))
                    {
                        throw new Exception($"Unable to parse date from file {rawFile}. Filename we attempted to parse: {fileDate}");
                    }

                    // Only process files within start and end bounds
                    if (currentDate < startDate || currentDate > endDate)
                    {
                        return;
                    }

                    using (var data = File.OpenRead(rawFile))
                    {
                        using (var archive = TarArchive.CreateInputTarArchive(new GZipInputStream(data)))
                        {
                            Directory.CreateDirectory(extractDataPath);
                            archive.ExtractContents(extractDataPath);

                            Log.Trace($"SECDataConverter.Process(): Extracted SEC data to path {extractDataPath}");
                        }
                    }

                    // For the meantime, let's only process .nc files, and deal with correction files later.
                    Parallel.ForEach(
                        Directory.GetFiles(extractDataPath, "*.nc", SearchOption.AllDirectories),
                        rawReportFilePath =>
                        {
                            // Skip files whose length in bytes exceeds MaxFileSize.
                            // NOTE(review): the previous comment described MaxFileSize as megabytes, but it is
                            // compared directly against FileInfo.Length (bytes) - confirm the intended unit.
                            if (MaxFileSize < new FileInfo(rawReportFilePath).Length)
                            {
                                Log.Trace($"SECDataConverter.Process(): File {rawReportFilePath} is too large to process. Continuing...");
                                return;
                            }

                            var factory = new SECReportFactory();
                            var xmlText = new StringBuilder();

                            // We need to escape any nested XML to ensure our deserialization happens smoothly.
                            // True while we are between a <TEXT> line and its matching </TEXT> line.
                            var parsingText = false;

                            foreach (var line in File.ReadLines(rawReportFilePath))
                            {
                                var newTextLine    = line;
                                var currentTagName = GetTagNameFromLine(newTextLine);

                                // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                                // Verified by searching with ripgrep for "CONFIRMING-COPY"
                                // Returning here leaves the lambda, so the whole report file is skipped.
                                if (currentTagName == "CONFIRMING-COPY")
                                {
                                    return;
                                }

                                // Don't encode the closing tag
                                if (currentTagName == "/TEXT")
                                {
                                    parsingText = false;
                                }

                                // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                                if (currentTagName == "FILING-DATE" || currentTagName == "PERIOD" ||
                                    currentTagName == "DATE-OF-FILING-CHANGE" || currentTagName == "DATE-CHANGED")
                                {
                                    newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                                }

                                // Encode all contents inside tags to prevent errors in XML parsing.
                                // The json deserializer will convert these values back to their original form
                                if (!parsingText && HasValue(newTextLine))
                                {
                                    newTextLine =
                                        $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                                }
                                // Escape all contents inside TEXT tags
                                else if (parsingText)
                                {
                                    newTextLine = SecurityElement.Escape(newTextLine);
                                }

                                // Don't encode the opening tag: the flag is raised only after the
                                // <TEXT> line itself has been handled above.
                                if (currentTagName == "TEXT")
                                {
                                    parsingText = true;
                                }

                                xmlText.AppendLine(newTextLine);
                            }

                            ISECReport report;
                            try
                            {
                                report = factory.CreateSECReport(xmlText.ToString());
                            }
                            catch (DataException e)
                            {
                                Log.Trace($"SECDataConverter.Process(): {e.Message}");
                                return;
                            }
                            catch (XmlException e)
                            {
                                Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file path: {rawReportFilePath}");
                                return;
                            }

                            // First filer listed in SEC report is usually the company listed on stock exchanges
                            var companyCik = report.Report.Filers.First().CompanyData.Cik;

                            // Some companies can operate under two tickers, but have the same CIK.
                            // Don't bother continuing if we don't find any tickers for the given CIK
                            List <string> tickers;
                            if (!CikTicker.TryGetValue(companyCik, out tickers))
                            {
                                return;
                            }

                            try
                            {
                                // There can potentially not be an index file present for the given CIK
                                GetPublicationDate(report, companyCik);
                            }
                            catch (Exception e)
                            {
                                // Logged only; the report is still stored below
                                Log.Error(e, $"Index file not found for company {companyCik}");
                            }

                            // Only tickers present in KnownEquities are stored.
                            // If we don't find a known equity in our list, the equity is probably not worth our time
                            foreach (var ticker in tickers.Where(KnownEquities.Contains))
                            {
                                var tickerReports = Reports.GetOrAdd(
                                    ticker,
                                    _ => new ConcurrentDictionary <DateTime, List <ISECReport> >()
                                    );
                                var reports = tickerReports.GetOrAdd(
                                    report.Report.FilingDate.Date,
                                    _ => new List <ISECReport>()
                                    );

                                // List<T> does not support concurrent writers and this lambda runs on
                                // several threads at once, so serialize additions to the shared list.
                                lock (reports)
                                {
                                    reports.Add(report);
                                }
                            }
                        }
                        );

                    // Flush everything that was collected for this archive's date
                    Parallel.ForEach(
                        Reports.Keys,
                        ticker =>
                        {
                            List <ISECReport> reports;
                            if (!Reports[ticker].TryRemove(currentDate, out reports))
                            {
                                return;
                            }

                            WriteReport(reports, ticker);
                        }
                        );

                    // This will clean up after ourselves without having to pay
                    // the expense of deleting every single file inside the raw_data folder
                    Directory.Delete(extractDataPath, true);
                }
                );
        }
Exemple #4
0
        public void SECDataConverter_ShouldReturnInternalTagName(string line, string expected)
        {
            // GetTagNameFromLine is called statically, so no SECReportFactory instance is needed
            Assert.AreEqual(expected, SECDataConverter.GetTagNameFromLine(line));
        }
Exemple #5
0
        public void SECDataConverter_GetsTagValueFromInput(string line, string expected)
        {
            // GetTagValueFromLine is called statically, so no SECReportFactory instance is needed
            Assert.AreEqual(expected, SECDataConverter.GetTagValueFromLine(line));
        }
Exemple #6
0
        /// <summary>
        /// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
        /// </summary>
        /// <remarks>
        /// Builds the CIK -> tickers lookup from the two mapping files in <c>RawSource</c>, copies the
        /// archive for <paramref name="processingDate"/> to a temporary file, parses every <c>.nc</c>
        /// entry into a report, and writes the reports whose filing date equals
        /// <paramref name="processingDate"/>. If the archive is missing on a date listed in
        /// <c>Holidays</c> or <c>USHoliday.Dates</c> the method returns without doing anything.
        /// </remarks>
        /// <param name="processingDate">Date to process SEC filings for</param>
        /// <exception cref="Exception">
        /// Thrown when the archive for <paramref name="processingDate"/> does not exist and the date
        /// is not a known holiday.
        /// </exception>
        public void Process(DateTime processingDate)
        {
            // Process data into dictionary of CIK -> List{T} of tickers
            foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings.txt")))
            {
                var tickerCik = line.Split('\t');
                var ticker    = tickerCik[0];
                // tickerCik[0] = symbol, tickerCik[1] = CIK
                // Note that SEC tickers come in lowercase, so we don't have to alter the ticker
                var cikFormatted = tickerCik[1].PadLeft(10, '0');

                List <string> symbol;
                if (!CikTicker.TryGetValue(cikFormatted, out symbol))
                {
                    symbol = new List <string>();
                    CikTicker[cikFormatted] = symbol;
                }

                // SEC data list contains a null value in the ticker.txt file
                if (!string.IsNullOrWhiteSpace(ticker))
                {
                    symbol.Add(ticker);
                }
            }

            // Merge both data sources to a single CIK -> List{T} of tickers
            foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings-rankandfile.txt")))
            {
                var tickerInfo = line.Split('|');

                var companyCik    = tickerInfo[0].PadLeft(10, '0');
                var companyTicker = tickerInfo[1].ToLowerInvariant();

                List <string> symbol;
                if (!CikTicker.TryGetValue(companyCik, out symbol))
                {
                    symbol = new List <string>();
                    CikTicker[companyCik] = symbol;
                }
                // Add null check just in case data comes malformed
                if (!symbol.Contains(companyTicker) && !string.IsNullOrWhiteSpace(companyTicker))
                {
                    symbol.Add(companyTicker);
                }
            }

            var formattedDate = processingDate.ToStringInvariant(DateFormat.EightCharacter);
            var remoteRawData = new FileInfo(Path.Combine(RawSource, $"{formattedDate}.nc.tar.gz"));

            if (!remoteRawData.Exists)
            {
                if (Holidays.Contains(processingDate) || USHoliday.Dates.Contains(processingDate))
                {
                    Log.Trace("SECDataConverter.Process(): File is missing, but we expected it to be missing. Nothing to do.");
                    return;
                }
                throw new Exception($"SECDataConverter.Process(): Raw data {remoteRawData} not found. No processing can be done.");
            }

            // Copy the raw data to a temp path on disk
            Log.Trace($"SECDataConverter.Process(): Copying raw data locally...");
            var tempPath     = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToStringInvariant(null));
            var localRawData = remoteRawData.CopyTo(tempPath);

            Log.Trace($"SECDataConverter.Process(): Copied raw data from {remoteRawData.FullName} - to: {tempPath}");

            Log.Trace($"SECDataConverter.Process(): Start processing...");

            var ncFilesRead      = 0;
            var startingTime     = DateTime.Now;
            // Reassigned from several worker threads without synchronization below, so the
            // files/min figure that is logged is only an approximation.
            var loopStartingTime = startingTime;

            // For the meantime, let's only process .nc files, and deal with correction files later.
            Parallel.ForEach(
                Compression.UnTar(localRawData.OpenRead(), isTarGz: true).Where(kvp => kvp.Key.EndsWith(".nc")),
                new ParallelOptions
                {
                    // Use half of the available cores, but never less than one: a value of 0
                    // (which ProcessorCount / 2 yields on a single-core host) is rejected by ParallelOptions.
                    MaxDegreeOfParallelism = Math.Max(1, Environment.ProcessorCount / 2)
                },
                rawReportFilePath =>
                {
                    var factory = new SECReportFactory();
                    var xmlText = new StringBuilder();

                    // We need to escape any nested XML to ensure our deserialization happens smoothly.
                    // True while we are between a <TEXT> line and its matching </TEXT> line.
                    var parsingText = false;

                    // SEC data is line separated by UNIX style line endings. No need to worry about a carriage line here.
                    foreach (var line in Encoding.UTF8.GetString(rawReportFilePath.Value).Split('\n'))
                    {
                        var newTextLine    = line;
                        var currentTagName = GetTagNameFromLine(newTextLine);

                        // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                        // Verified by searching with ripgrep for "CONFIRMING-COPY"
                        //
                        // Sometimes, ASSIGNED-SIC contains no value and causes errors. Check to make sure that when
                        // we encounter that tag we check if it has a value.
                        //
                        // "Appearance of the <FLAWED> tag  in
                        //  an EX-27  document header signals unreliable tagging within  the
                        //  following  document text stream; however, in  the absence  of a
                        //  <FLAWED>  tag, tagging is still not guaranteed to  be complete
                        //  because of  allowance in the financial data specifications  for
                        //  omitted tags when the submission also includes a financial  data
                        //  schedule  of article type CT."
                        if (currentTagName == "CONFIRMING-COPY" || (currentTagName == "ASSIGNED-SIC" && !HasValue(line)) || currentTagName == "FLAWED")
                        {
                            continue;
                        }

                        // Indicates that the form is a paper submission and that the current file has no contents
                        if (currentTagName == "PAPER")
                        {
                            continue;
                        }

                        // Don't encode the closing tag
                        if (currentTagName == "/TEXT")
                        {
                            parsingText = false;
                        }

                        // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                        if (currentTagName == "FILING-DATE" || currentTagName == "PERIOD" ||
                            currentTagName == "DATE-OF-FILING-CHANGE" || currentTagName == "DATE-CHANGED")
                        {
                            newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                        }

                        // Encode all contents inside tags to prevent errors in XML parsing.
                        // The json deserializer will convert these values back to their original form
                        if (!parsingText && HasValue(newTextLine))
                        {
                            newTextLine =
                                $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                        }
                        // Escape all contents inside TEXT tags
                        else if (parsingText)
                        {
                            newTextLine = SecurityElement.Escape(newTextLine);
                        }

                        // Don't encode the opening tag: the flag is raised only after the
                        // <TEXT> line itself has been handled above.
                        if (currentTagName == "TEXT")
                        {
                            parsingText = true;
                        }

                        xmlText.AppendLine(newTextLine);
                    }

                    // Progress report every 100 files
                    var counter = Interlocked.Increment(ref ncFilesRead);
                    if (counter % 100 == 0)
                    {
                        var interval = DateTime.Now - loopStartingTime;
                        Log.Trace($"SECDataConverter.Process(): {counter.ToStringInvariant()} nc files read at {(100 / interval.TotalMinutes).ToStringInvariant("N2")} files/min.");
                        loopStartingTime = DateTime.Now;
                    }

                    ISECReport report;
                    try
                    {
                        report = factory.CreateSECReport(xmlText.ToString());
                    }
                    // Ignore unsupported form types for now
                    catch (DataException)
                    {
                        return;
                    }
                    catch (XmlException e)
                    {
                        Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file: {rawReportFilePath.Key}");
                        return;
                    }
                    catch (Exception e)
                    {
                        Log.Error(e, "SECDataConverter.Process(): Unknown error encountered");
                        return;
                    }

                    // First filer listed in SEC report is usually the company listed on stock exchanges
                    var companyCik = report.Report.Filers.First().CompanyData.Cik;

                    // Some companies can operate under two tickers, but have the same CIK.
                    // Don't bother continuing if we don't find any tickers for the given CIK
                    List <string> tickers;
                    if (!CikTicker.TryGetValue(companyCik, out tickers))
                    {
                        return;
                    }

                    if (!File.Exists(Path.Combine(RawSource, "indexes", $"{companyCik}.json")))
                    {
                        Log.Error($"SECDataConverter.Process(): {report.Report.FilingDate.ToStringInvariant("yyyy-MM-dd")}:{rawReportFilePath.Key} - Failed to find index file for ticker {tickers.FirstOrDefault()} with CIK: {companyCik}");
                        return;
                    }

                    try
                    {
                        // The index file can potentially be corrupted
                        GetPublicationDate(report, companyCik);
                    }
                    catch (Exception e)
                    {
                        // Logged only; the report is still stored below
                        Log.Error(e, $"SECDataConverter.Process(): {report.Report.FilingDate.ToStringInvariant("yyyy-MM-dd")}:{rawReportFilePath.Key} - Index file loading failed for ticker: {tickers.FirstOrDefault()} with CIK: {companyCik} even though it exists");
                    }

                    // If the equity does not resolve to a map file, or no mapped symbol
                    // is found for the processing date, we skip writing it.
                    foreach (var ticker in tickers)
                    {
                        var tickerMapFile = _mapFileResolver.ResolveMapFile(ticker, processingDate);
                        if (!tickerMapFile.Any())
                        {
                            Log.Trace($"SECDataConverter.Process(): {processingDate.ToStringInvariant()} - Failed to find map file for ticker: {ticker}");
                            continue;
                        }

                        // Map the current ticker to the ticker it was in the past using the map file system
                        var mappedTicker = tickerMapFile.GetMappedSymbol(processingDate);

                        // If no suitable date is found for the symbol in the map file, we skip writing the data
                        if (string.IsNullOrEmpty(mappedTicker))
                        {
                            Log.Trace($"SECDataConverter.Process(): {processingDate.ToStringInvariant()} - Failed to find mapped symbol for ticker: {ticker}");
                            continue;
                        }

                        var tickerReports = Reports.GetOrAdd(
                            mappedTicker,
                            _ => new ConcurrentDictionary <DateTime, List <ISECReport> >()
                            );
                        var reports = tickerReports.GetOrAdd(
                            report.Report.FilingDate.Date,
                            _ => new List <ISECReport>()
                            );

                        // List<T> does not support concurrent writers and this lambda runs on
                        // several threads at once, so serialize additions to the shared list.
                        lock (reports)
                        {
                            reports.Add(report);
                        }
                    }
                }
                );

            Log.Trace($"SECDataConverter.Process(): {ncFilesRead} nc files read finished in {(DateTime.Now - startingTime).ToStringInvariant("g")}.");

            // Flush everything that was collected for the processing date
            Parallel.ForEach(
                Reports.Keys,
                ticker =>
                {
                    List <ISECReport> reports;
                    if (!Reports[ticker].TryRemove(processingDate, out reports))
                    {
                        return;
                    }

                    WriteReport(reports, ticker);
                }
                );

            // Delete the raw data we copied to the temp folder
            File.Delete(tempPath);
        }