public void SECDataConvert_ShouldParseSingleOrMultipleFilingValuesFields()
{
    // Load both raw fixtures up front so a missing file fails before any parsing is attempted.
    var singleFixture = File.ReadAllText(Path.Combine("TestData", "sec_report_raw_single.xml"));
    var multipleFixture = File.ReadAllText(Path.Combine("TestData", "sec_report_raw_multiple.xml"));

    var reportFactory = new SECReportFactory();

    // The test only requires that creating a report from each fixture does not throw.
    foreach (var rawReport in new[] { singleFixture, multipleFixture })
    {
        Assert.DoesNotThrow(() => reportFactory.CreateSECReport(rawReport));
    }
}
public void SECReportFactory_CreatesProperReportType()
{
    var reportFactory = new SECReportFactory();

    // Build one report per form fixture, in the same order: 8-K, 10-K, 10-Q.
    var parsed8k = reportFactory.CreateSECReport(_xmlForm8k);
    var parsed10k = reportFactory.CreateSECReport(_xmlForm10k);
    var parsed10q = reportFactory.CreateSECReport(_xmlForm10q);

    // An `as` cast yields null when the runtime type does not match,
    // so a non-null result means the factory produced the expected report class.
    Assert.NotNull(parsed8k as SECReport8K);
    Assert.NotNull(parsed10k as SECReport10K);
    Assert.NotNull(parsed10q as SECReport10Q);
}
/// <summary>
/// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
/// </summary>
/// <remarks>
/// Every archive found under <c>RawSource</c> whose file name starts with a <c>yyyyMMdd</c> date inside
/// the requested bounds is extracted, its <c>.nc</c> files are converted to XML and parsed into reports,
/// and the reports collected for the archive's date are written out per ticker.
/// </remarks>
/// <param name="startDate">Starting date to start process files</param>
/// <param name="endDate">Ending date to stop processing files</param>
/// <exception cref="Exception">Thrown when an archive's file name does not start with a yyyyMMdd date</exception>
public void Process(DateTime startDate, DateTime endDate)
{
    Parallel.ForEach(
        Directory.GetFiles(RawSource, "*.nc.tar.gz", SearchOption.AllDirectories).ToList(),
        rawFile =>
        {
            // GetFileNameWithoutExtension only strips the first extension from the name.
            var fileDate = Path.GetFileName(rawFile).Split('.')[0];
            var extractDataPath = Path.Combine(RawSource, fileDate);

            DateTime currentDate;
            if (!DateTime.TryParseExact(fileDate, "yyyyMMdd", CultureInfo.InvariantCulture, DateTimeStyles.None, out currentDate))
            {
                throw new Exception($"Unable to parse date from file {rawFile}. Filename we attempted to parse: {fileDate}");
            }

            // Only process files within start and end bounds
            if (currentDate < startDate || currentDate > endDate)
            {
                return;
            }

            using (var data = File.OpenRead(rawFile))
            {
                using (var archive = TarArchive.CreateInputTarArchive(new GZipInputStream(data)))
                {
                    Directory.CreateDirectory(extractDataPath);
                    archive.ExtractContents(extractDataPath);

                    Log.Trace($"SECDataConverter.Process(): Extracted SEC data to path {extractDataPath}");
                }
            }

            // For the meantime, let's only process .nc files, and deal with correction files later.
            Parallel.ForEach(
                Directory.GetFiles(extractDataPath, "*.nc", SearchOption.AllDirectories),
                rawReportFilePath =>
                {
                    // Avoid processing files larger than MaxFileSize (compared against the file length in bytes)
                    if (MaxFileSize < new FileInfo(rawReportFilePath).Length)
                    {
                        Log.Trace($"SECDataConverter.Process(): File {rawReportFilePath} is too large to process. Continuing...");
                        return;
                    }

                    var factory = new SECReportFactory();
                    var xmlText = new StringBuilder();

                    // We need to escape any nested XML to ensure our deserialization happens smoothly
                    var parsingText = false;

                    foreach (var line in File.ReadLines(rawReportFilePath))
                    {
                        var newTextLine = line;
                        var currentTagName = GetTagNameFromLine(newTextLine);

                        // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                        // Verified by searching with ripgrep for "CONFIRMING-COPY"
                        // NOTE(review): returning here abandons the whole report file rather than
                        // skipping only this line - confirm that this is intended.
                        if (currentTagName == "CONFIRMING-COPY")
                        {
                            return;
                        }

                        // Don't encode the closing tag
                        if (currentTagName == "/TEXT")
                        {
                            parsingText = false;
                        }

                        // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                        if (currentTagName == "FILING-DATE" ||
                            currentTagName == "PERIOD" ||
                            currentTagName == "DATE-OF-FILING-CHANGE" ||
                            currentTagName == "DATE-CHANGED")
                        {
                            newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                        }

                        // Encode all contents inside tags to prevent errors in XML parsing.
                        // The json deserializer will convert these values back to their original form
                        if (!parsingText && HasValue(newTextLine))
                        {
                            newTextLine = $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                        }
                        // Escape all contents inside TEXT tags
                        else if (parsingText)
                        {
                            newTextLine = SecurityElement.Escape(newTextLine);
                        }

                        // Don't encode the opening tag
                        if (currentTagName == "TEXT")
                        {
                            parsingText = true;
                        }

                        xmlText.AppendLine(newTextLine);
                    }

                    ISECReport report;
                    try
                    {
                        report = factory.CreateSECReport(xmlText.ToString());
                    }
                    catch (DataException e)
                    {
                        Log.Trace($"SECDataConverter.Process(): {e.Message}");
                        return;
                    }
                    catch (XmlException e)
                    {
                        Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file path: {rawReportFilePath}");
                        return;
                    }

                    // First filer listed in SEC report is usually the company listed on stock exchanges
                    var companyCik = report.Report.Filers.First().CompanyData.Cik;

                    // Some companies can operate under two tickers, but have the same CIK.
                    // Don't bother continuing if we don't find any tickers for the given CIK
                    List<string> tickers;
                    if (!CikTicker.TryGetValue(companyCik, out tickers))
                    {
                        return;
                    }

                    try
                    {
                        // There can potentially not be an index file present for the given CIK
                        GetPublicationDate(report, companyCik);
                    }
                    catch (Exception e)
                    {
                        Log.Error(e, $"Index file not found for company {companyCik}");
                    }

                    // Only tickers present in KnownEquities are stored.
                    // If we don't find a known equity in our list, the equity is probably not worth our time
                    foreach (var ticker in tickers.Where(KnownEquities.Contains))
                    {
                        var tickerReports = Reports.GetOrAdd(
                            ticker,
                            _ => new ConcurrentDictionary<DateTime, List<ISECReport>>()
                        );
                        var reports = tickerReports.GetOrAdd(
                            report.Report.FilingDate.Date,
                            _ => new List<ISECReport>()
                        );

                        // The list is shared between the parallel workers and List<T> does not
                        // support concurrent writers, so additions have to be serialized.
                        lock (reports)
                        {
                            reports.Add(report);
                        }
                    }
                }
            );

            Parallel.ForEach(
                Reports.Keys,
                ticker =>
                {
                    List<ISECReport> reports;
                    if (!Reports[ticker].TryRemove(currentDate, out reports))
                    {
                        return;
                    }

                    WriteReport(reports, ticker);
                }
            );

            // This will clean up after ourselves without having to pay
            // the expense of deleting every single file inside the raw_data folder
            Directory.Delete(extractDataPath, true);
        }
    );
}
public void SECDataConverter_ShouldReturnInternalTagName(string line, string expected)
{
    // GetTagNameFromLine is invoked through the SECDataConverter type, so no report factory
    // is needed here; the previously unused local has been dropped.
    var actual = SECDataConverter.GetTagNameFromLine(line);

    Assert.AreEqual(expected, actual);
}
public void SECDataConverter_GetsTagValueFromInput(string line, string expected)
{
    // GetTagValueFromLine is invoked through the SECDataConverter type, so no report factory
    // is needed here; the previously unused local has been dropped.
    var actual = SECDataConverter.GetTagValueFromLine(line);

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
/// </summary>
/// <remarks>
/// Builds the CIK -> tickers lookup from the two mapping files found in <c>RawSource</c>, copies the
/// archive for <paramref name="processingDate"/> to a temporary file, parses every <c>.nc</c> entry
/// into a report and writes the reports collected for that date per mapped ticker.
/// The temporary copy is deleted even when processing fails.
/// </remarks>
/// <param name="processingDate">Date to process SEC filings for</param>
/// <exception cref="Exception">
/// Thrown when the archive for the date is missing and the date is not contained in
/// <c>Holidays</c> or <c>USHoliday.Dates</c>
/// </exception>
public void Process(DateTime processingDate)
{
    // Process data into dictionary of CIK -> List{T} of tickers
    foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings.txt")))
    {
        var tickerCik = line.Split('\t');
        var ticker = tickerCik[0];
        // tickerCik[0] = symbol, tickerCik[1] = CIK
        // Note that SEC tickers come in lowercase, so we don't have to alter the ticker
        var cikFormatted = tickerCik[1].PadLeft(10, '0');

        List<string> symbol;
        if (!CikTicker.TryGetValue(cikFormatted, out symbol))
        {
            symbol = new List<string>();
            CikTicker[cikFormatted] = symbol;
        }

        // SEC data list contains a null value in the ticker.txt file
        if (!string.IsNullOrWhiteSpace(ticker))
        {
            symbol.Add(ticker);
        }
    }

    // Merge both data sources to a single CIK -> List{T} of tickers
    foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings-rankandfile.txt")))
    {
        var tickerInfo = line.Split('|');

        var companyCik = tickerInfo[0].PadLeft(10, '0');
        var companyTicker = tickerInfo[1].ToLowerInvariant();

        List<string> symbol;
        if (!CikTicker.TryGetValue(companyCik, out symbol))
        {
            symbol = new List<string>();
            CikTicker[companyCik] = symbol;
        }

        // Add null check just in case data comes malformed
        if (!symbol.Contains(companyTicker) && !string.IsNullOrWhiteSpace(companyTicker))
        {
            symbol.Add(companyTicker);
        }
    }

    var formattedDate = processingDate.ToStringInvariant(DateFormat.EightCharacter);
    var remoteRawData = new FileInfo(Path.Combine(RawSource, $"{formattedDate}.nc.tar.gz"));
    if (!remoteRawData.Exists)
    {
        if (Holidays.Contains(processingDate) || USHoliday.Dates.Contains(processingDate))
        {
            Log.Trace("SECDataConverter.Process(): File is missing, but we expected it to be missing. Nothing to do.");
            return;
        }

        throw new Exception($"SECDataConverter.Process(): Raw data {remoteRawData} not found. No processing can be done.");
    }

    // Copy the raw data to a temp path on disk
    Log.Trace($"SECDataConverter.Process(): Copying raw data locally...");
    var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToStringInvariant(null));

    try
    {
        var localRawData = remoteRawData.CopyTo(tempPath);
        Log.Trace($"SECDataConverter.Process(): Copied raw data from {remoteRawData.FullName} - to: {tempPath}");

        Log.Trace($"SECDataConverter.Process(): Start processing...");

        var ncFilesRead = 0;
        var startingTime = DateTime.Now;
        var loopStartingTime = startingTime;

        // Make sure the handle on the temp copy is released before the file gets deleted
        using (var rawDataStream = localRawData.OpenRead())
        {
            // For the meantime, let's only process .nc files, and deal with correction files later.
            Parallel.ForEach(
                Compression.UnTar(rawDataStream, isTarGz: true).Where(kvp => kvp.Key.EndsWith(".nc")),
                // MaxDegreeOfParallelism rejects 0, which is what the division yields on a single core machine
                new ParallelOptions { MaxDegreeOfParallelism = Math.Max(1, Environment.ProcessorCount / 2) },
                rawReportFilePath =>
                {
                    var factory = new SECReportFactory();
                    var xmlText = new StringBuilder();

                    // We need to escape any nested XML to ensure our deserialization happens smoothly
                    var parsingText = false;

                    // SEC data is line separated by UNIX style line endings. No need to worry about a carriage line here.
                    foreach (var line in Encoding.UTF8.GetString(rawReportFilePath.Value).Split('\n'))
                    {
                        var newTextLine = line;
                        var currentTagName = GetTagNameFromLine(newTextLine);

                        // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                        // Verified by searching with ripgrep for "CONFIRMING-COPY"
                        //
                        // Sometimes, ASSIGNED-SIC contains no value and causes errors. Check to make sure that when
                        // we encounter that tag we check if it has a value.
                        //
                        // "Appearance of the <FLAWED> tag in
                        //  an EX-27 document header signals unreliable tagging within the
                        //  following document text stream; however, in the absence of a
                        //  <FLAWED> tag, tagging is still not guaranteed to be complete
                        //  because of allowance in the financial data specifications for
                        //  omitted tags when the submission also includes a financial data
                        //  schedule of article type CT."
                        if (currentTagName == "CONFIRMING-COPY" ||
                            (currentTagName == "ASSIGNED-SIC" && !HasValue(line)) ||
                            currentTagName == "FLAWED")
                        {
                            continue;
                        }

                        // Indicates that the form is a paper submission and that the current file has no contents
                        if (currentTagName == "PAPER")
                        {
                            continue;
                        }

                        // Don't encode the closing tag
                        if (currentTagName == "/TEXT")
                        {
                            parsingText = false;
                        }

                        // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                        if (currentTagName == "FILING-DATE" ||
                            currentTagName == "PERIOD" ||
                            currentTagName == "DATE-OF-FILING-CHANGE" ||
                            currentTagName == "DATE-CHANGED")
                        {
                            newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                        }

                        // Encode all contents inside tags to prevent errors in XML parsing.
                        // The json deserializer will convert these values back to their original form
                        if (!parsingText && HasValue(newTextLine))
                        {
                            newTextLine = $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                        }
                        // Escape all contents inside TEXT tags
                        else if (parsingText)
                        {
                            newTextLine = SecurityElement.Escape(newTextLine);
                        }

                        // Don't encode the opening tag
                        if (currentTagName == "TEXT")
                        {
                            parsingText = true;
                        }

                        xmlText.AppendLine(newTextLine);
                    }

                    // Progress is reported once every 100 files read
                    var counter = Interlocked.Increment(ref ncFilesRead);
                    if (counter % 100 == 0)
                    {
                        var interval = DateTime.Now - loopStartingTime;
                        Log.Trace($"SECDataConverter.Process(): {counter.ToStringInvariant()} nc files read at {(100 / interval.TotalMinutes).ToStringInvariant("N2")} files/min.");
                        loopStartingTime = DateTime.Now;
                    }

                    ISECReport report;
                    try
                    {
                        report = factory.CreateSECReport(xmlText.ToString());
                    }
                    // Ignore unsupported form types for now
                    catch (DataException)
                    {
                        return;
                    }
                    catch (XmlException e)
                    {
                        Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file: {rawReportFilePath.Key}");
                        return;
                    }
                    catch (Exception e)
                    {
                        Log.Error(e, "SECDataConverter.Process(): Unknown error encountered");
                        return;
                    }

                    // First filer listed in SEC report is usually the company listed on stock exchanges
                    var companyCik = report.Report.Filers.First().CompanyData.Cik;

                    // Some companies can operate under two tickers, but have the same CIK.
                    // Don't bother continuing if we don't find any tickers for the given CIK
                    List<string> tickers;
                    if (!CikTicker.TryGetValue(companyCik, out tickers))
                    {
                        return;
                    }

                    if (!File.Exists(Path.Combine(RawSource, "indexes", $"{companyCik}.json")))
                    {
                        Log.Error($"SECDataConverter.Process(): {report.Report.FilingDate.ToStringInvariant("yyyy-MM-dd")}:{rawReportFilePath.Key} - Failed to find index file for ticker {tickers.FirstOrDefault()} with CIK: {companyCik}");
                        return;
                    }

                    try
                    {
                        // The index file can potentially be corrupted
                        GetPublicationDate(report, companyCik);
                    }
                    catch (Exception e)
                    {
                        Log.Error(e, $"SECDataConverter.Process(): {report.Report.FilingDate.ToStringInvariant("yyyy-MM-dd")}:{rawReportFilePath.Key} - Index file loading failed for ticker: {tickers.FirstOrDefault()} with CIK: {companyCik} even though it exists");
                    }

                    // Resolve every ticker through the map files. If the ticker does not resolve
                    // to a map file or no mapped symbol is found for the date, we skip writing it.
                    foreach (var ticker in tickers)
                    {
                        var tickerMapFile = _mapFileResolver.ResolveMapFile(ticker, processingDate);
                        if (!tickerMapFile.Any())
                        {
                            Log.Trace($"SECDataConverter.Process(): {processingDate.ToStringInvariant()} - Failed to find map file for ticker: {ticker}");
                            continue;
                        }

                        // Map the current ticker to the ticker it was in the past using the map file system
                        var mappedTicker = tickerMapFile.GetMappedSymbol(processingDate);

                        // If no suitable date is found for the symbol in the map file, we skip writing the data
                        if (string.IsNullOrEmpty(mappedTicker))
                        {
                            Log.Trace($"SECDataConverter.Process(): {processingDate.ToStringInvariant()} - Failed to find mapped symbol for ticker: {ticker}");
                            continue;
                        }

                        var tickerReports = Reports.GetOrAdd(
                            mappedTicker,
                            _ => new ConcurrentDictionary<DateTime, List<ISECReport>>()
                        );
                        var reports = tickerReports.GetOrAdd(
                            report.Report.FilingDate.Date,
                            _ => new List<ISECReport>()
                        );

                        // The list is shared between the parallel workers and List<T> does not
                        // support concurrent writers, so additions have to be serialized.
                        lock (reports)
                        {
                            reports.Add(report);
                        }
                    }
                }
            );
        }

        Log.Trace($"SECDataConverter.Process(): {ncFilesRead} nc files read finished in {(DateTime.Now - startingTime).ToStringInvariant("g")}.");

        Parallel.ForEach(
            Reports.Keys,
            ticker =>
            {
                List<ISECReport> reports;
                if (!Reports[ticker].TryRemove(processingDate, out reports))
                {
                    return;
                }

                WriteReport(reports, ticker);
            }
        );
    }
    finally
    {
        // Delete the raw data we copied to the temp folder, also when processing failed half way
        File.Delete(tempPath);
    }
}