Esempio n. 1
0
        /// <summary>
        /// Visits each directory directly under the equity folder and lazily aggregates its daily data into the coarse files
        /// </summary>
        /// <param name="dataDirectory">The Lean /Data directory</param>
        /// <param name="ignoreMaplessSymbols">Ignore symbols without a QuantQuote map file.</param>
        /// <param name="startDate">Value passed on as the last processed date; when null, the value from GetLastProcessedDate for the coarse folder is used instead</param>
        /// <returns>The values produced by ProcessDailyFolder for every visited directory, in enumeration order</returns>
        public static IEnumerable<string> ProcessEquityDirectories(string dataDirectory, bool ignoreMaplessSymbols, DateTime? startDate)
        {
            var exclusions = ReadExclusionsFile(ExclusionsFile);
            var equityRoot = Path.Combine(dataDirectory, "equity");

            foreach (var equitySubdirectory in Directory.EnumerateDirectories(equityRoot))
            {
                // CreateDirectory leaves an already existing directory untouched, so no prior existence check is needed
                var coarsePath = Path.Combine(equitySubdirectory, "fundamental", "coarse");
                Directory.CreateDirectory(coarsePath);

                // An explicit start date takes precedence over whatever the coarse folder reports
                var lastProcessedDate = startDate ?? GetLastProcessedDate(coarsePath);
                var factorFiles = new LocalDiskFactorFileProvider();

                var dailyPath = Path.Combine(equitySubdirectory, "daily");
                var mapFiles = MapFileResolver.Create(Path.Combine(equitySubdirectory, "map_files"));

                foreach (var coarseFile in ProcessDailyFolder(dailyPath, coarsePath, mapFiles, factorFiles, exclusions, ignoreMaplessSymbols, lastProcessedDate))
                {
                    yield return coarseFile;
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Initializes the downloader: sets the date bounds, the request gate and the USA map file resolver,
        /// and makes sure the "earnings" directory exists under the destination folder
        /// </summary>
        /// <param name="destinationFolder">Folder under which the "earnings" directory is created</param>
        public TradingEconomicsEarningsDownloader(string destinationFolder)
        {
            // Date bounds: 1998-01-01 through the local time at construction
            _fromDate = new DateTime(1998, 1, 1);
            _toDate = DateTime.Now;

            // NOTE(review): presumably limits requests to one per second - confirm against RateGate
            _requestGate = new RateGate(1, TimeSpan.FromSeconds(1));
            _mapFileResolver = MapFileResolver.Create(Globals.DataFolder, Market.USA);

            _destinationFolder = Path.Combine(destinationFolder, "earnings");
            Directory.CreateDirectory(_destinationFolder);
        }
Esempio n. 3
0
        /// <summary>
        /// Visits each directory directly under the equity folder and aggregates its daily data into the coarse files
        /// </summary>
        /// <param name="dataDirectory">The Lean /Data directory</param>
        /// <param name="ignoreMaplessSymbols">Ignore symbols without a QuantQuote map file.</param>
        public static void ProcessEquityDirectories(string dataDirectory, bool ignoreMaplessSymbols)
        {
            var exclusions = ReadExclusionsFile(ExclusionsFile);
            var equityRoot = Path.Combine(dataDirectory, "equity");

            foreach (var equitySubdirectory in Directory.EnumerateDirectories(equityRoot))
            {
                // CreateDirectory leaves an already existing directory untouched, so no prior existence check is needed
                var coarsePath = Path.Combine(equitySubdirectory, "fundamental", "coarse");
                Directory.CreateDirectory(coarsePath);

                var dailyPath = Path.Combine(equitySubdirectory, "daily");
                var mapFiles = MapFileResolver.Create(Path.Combine(equitySubdirectory, "map_files"));

                ProcessDailyFolder(dailyPath, coarsePath, mapFiles, exclusions, ignoreMaplessSymbols);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Enumerates the canonical AAPL option chain universe at minute resolution between
        /// 2014-06-06 and 2014-06-09 20:00 and checks that every emitted item is an
        /// <see cref="OptionChainUniverseDataCollection"/> with an underlying and at least one
        /// data point, and that exactly 780 items are emitted.
        /// </summary>
        public void DoesNotEmitInvalidData()
        {
            var start = new DateTime(2014, 06, 06, 0, 0, 0);
            var end = new DateTime(2014, 06, 09, 20, 0, 0);

            var canonical = Symbol.Create("AAPL", SecurityType.Option, Market.USA, "?AAPL");

            var usd = new Cash(Currencies.USD, 0, 1);
            var hours = MarketHoursDatabase.FromDataFolder().GetExchangeHours(Market.USA, canonical, SecurityType.Option);
            var subscriptionConfig = new SubscriptionDataConfig(
                typeof(ZipEntryName),
                canonical,
                Resolution.Minute,
                TimeZones.Utc,
                TimeZones.NewYork,
                true,
                false,
                false,
                false,
                TickType.Quote,
                false,
                DataNormalizationMode.Raw);

            var option = new Option(
                canonical,
                hours,
                usd,
                new OptionSymbolProperties(SymbolProperties.GetDefault(Currencies.USD)),
                ErrorCurrencyConverter.Instance,
                RegisteredSecurityDataTypesProvider.Null,
                new SecurityCache());

            var fillForwardResolution = Ref.CreateReadOnly(() => Resolution.Minute.ToTimeSpan());

            // Wraps the raw underlying enumerator: aggregate into collections first, then fill forward
            Func<SubscriptionRequest, IEnumerator<BaseData>, IEnumerator<BaseData>> wrapUnderlying = (req, input) =>
            {
                IEnumerator<BaseData> aggregated = new BaseDataCollectionAggregatorEnumerator(input, req.Configuration.Symbol);
                return new FillForwardEnumerator(
                    aggregated,
                    option.Exchange,
                    fillForwardResolution,
                    false,
                    end,
                    Resolution.Minute.ToTimeSpan(),
                    TimeZones.Utc,
                    start);
            };

            var enumeratorFactory = new OptionChainUniverseSubscriptionEnumeratorFactory(
                wrapUnderlying,
                MapFileResolver.Create(Globals.DataFolder, Market.USA),
                new LocalDiskFactorFileProvider(new LocalDiskMapFileProvider()));

            var subscriptionRequest = new SubscriptionRequest(true, null, option, subscriptionConfig, start, end);
            var chainEnumerator = enumeratorFactory.CreateEnumerator(subscriptionRequest, new DefaultDataProvider());

            var emitted = 0;
            foreach (var item in chainEnumerator.AsEnumerable())
            {
                emitted++;

                var chain = item as OptionChainUniverseDataCollection;
                Assert.IsNotNull(chain);
                Assert.IsNotNull(chain.Underlying);
                Assert.AreNotEqual(0, chain.Data.Count);
            }

            // 9:30 to 15:59 -> 6.5 hours * 60 => 390 minutes * 2 days = 780
            Assert.AreEqual(780, emitted);
        }
Esempio n. 5
0
        /// <summary>
        /// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
        /// </summary>
        /// <remarks>
        /// Steps performed:
        /// 1. Loads both CIK to ticker mapping files found in RawSource into CikTicker.
        /// 2. Copies the archive for <paramref name="processingDate"/> to a temporary file.
        /// 3. Parses every ".nc" entry of the archive in parallel and stores the resulting reports in
        ///    Reports, keyed by mapped ticker and filing date.
        /// 4. For every ticker, removes the reports stored under <paramref name="processingDate"/> and
        ///    hands them to WriteReport.
        /// The temporary copy is deleted even when processing fails.
        /// </remarks>
        /// <param name="processingDate">Date to process SEC filings for</param>
        /// <exception cref="Exception">
        /// The raw archive for the date does not exist and the date is neither in Holidays nor in USHoliday.Dates
        /// </exception>
        public void Process(DateTime processingDate)
        {
            // Process data into dictionary of CIK -> List{T} of tickers
            foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings.txt")))
            {
                var tickerCik = line.Split('\t');
                var ticker    = tickerCik[0];
                // tickerCik[0] = symbol, tickerCik[1] = CIK
                // Note that SEC tickers come in lowercase, so we don't have to alter the ticker
                var cikFormatted = tickerCik[1].PadLeft(10, '0');

                List<string> symbol;
                if (!CikTicker.TryGetValue(cikFormatted, out symbol))
                {
                    symbol = new List<string>();
                    CikTicker[cikFormatted] = symbol;
                }

                // SEC data list contains a null value in the ticker.txt file
                if (!string.IsNullOrWhiteSpace(ticker))
                {
                    symbol.Add(ticker);
                }
            }

            // Merge both data sources to a single CIK -> List{T} of tickers
            foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings-rankandfile.txt")))
            {
                var tickerInfo = line.Split('|');

                var companyCik    = tickerInfo[0].PadLeft(10, '0');
                var companyTicker = tickerInfo[1].ToLower();

                List<string> symbol;
                if (!CikTicker.TryGetValue(companyCik, out symbol))
                {
                    symbol = new List<string>();
                    CikTicker[companyCik] = symbol;
                }
                // Add null check just in case data comes malformed
                if (!symbol.Contains(companyTicker) && !string.IsNullOrWhiteSpace(companyTicker))
                {
                    symbol.Add(companyTicker);
                }
            }

            var formattedDate = processingDate.ToString(DateFormat.EightCharacter);
            var remoteRawData = new FileInfo(Path.Combine(RawSource, $"{formattedDate}.nc.tar.gz"));

            if (!remoteRawData.Exists)
            {
                if (Holidays.Contains(processingDate) || USHoliday.Dates.Contains(processingDate))
                {
                    Log.Trace("SECDataConverter.Process(): File is missing, but we expected it to be missing. Nothing to do.");
                    return;
                }
                throw new Exception($"SECDataConverter.Process(): Raw data {remoteRawData} not found. No processing can be done.");
            }

            // Copy the raw data to a temp path on disk
            Log.Trace($"SECDataConverter.Process(): Copying raw data locally...");
            var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());

            try
            {
                var localRawData = remoteRawData.CopyTo(tempPath);

                Log.Trace($"SECDataConverter.Process(): Copied raw data from {remoteRawData.FullName} - to: {tempPath}");

                Log.Trace($"SECDataConverter.Process(): Start processing...");

                var mapFileResolver = MapFileResolver.Create(Globals.DataFolder, Market.USA);

                var ncFilesRead  = 0;
                var startingTime = DateTime.Now;
                // Shared by all workers without synchronization; only used for the approximate throughput log below
                var loopStartingTime = startingTime;

                // Own the archive stream here so that it is closed before the temp file gets deleted
                using (var rawDataStream = localRawData.OpenRead())
                {
                    // For the meantime, let's only process .nc files, and deal with correction files later.
                    Parallel.ForEach(
                        Compression.UnTar(rawDataStream, isTarGz: true).Where(kvp => kvp.Key.EndsWith(".nc")),
                        // ProcessorCount / 2 is zero on a single core host and ParallelOptions rejects zero
                        new ParallelOptions { MaxDegreeOfParallelism = Math.Max(1, Environment.ProcessorCount / 2) },
                        rawReportFilePath =>
                        {
                            var factory = new SECReportFactory();
                            var xmlText = new StringBuilder();

                            // We need to escape any nested XML to ensure our deserialization happens smoothly
                            var parsingText = false;

                            // SEC data is line separated by UNIX style line endings. No need to worry about a carriage return here.
                            foreach (var line in Encoding.UTF8.GetString(rawReportFilePath.Value).Split('\n'))
                            {
                                var newTextLine    = line;
                                var currentTagName = GetTagNameFromLine(newTextLine);

                                // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                                // Verified by searching with ripgrep for "CONFIRMING-COPY"
                                //
                                // Sometimes, ASSIGNED-SIC contains no value and causes errors. Check to make sure that when
                                // we encounter that tag we check if it has a value.
                                //
                                // "Appearance of the <FLAWED> tag  in
                                //  an EX-27  document header signals unreliable tagging within  the
                                //  following  document text stream; however, in  the absence  of a
                                //  <FLAWED>  tag, tagging is still not guaranteed to  be complete
                                //  because of  allowance in the financial data specifications  for
                                //  omitted tags when the submission also includes a financial  data
                                //  schedule  of article type CT."
                                if (currentTagName == "CONFIRMING-COPY" || (currentTagName == "ASSIGNED-SIC" && !HasValue(line)) || currentTagName == "FLAWED")
                                {
                                    continue;
                                }

                                // Indicates that the form is a paper submission and that the current file has no contents
                                if (currentTagName == "PAPER")
                                {
                                    continue;
                                }

                                // Don't encode the closing tag
                                if (currentTagName == "/TEXT")
                                {
                                    parsingText = false;
                                }

                                // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                                if (currentTagName == "FILING-DATE" || currentTagName == "PERIOD" ||
                                    currentTagName == "DATE-OF-FILING-CHANGE" || currentTagName == "DATE-CHANGED")
                                {
                                    newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                                }

                                // Encode all contents inside tags to prevent errors in XML parsing.
                                // The json deserializer will convert these values back to their original form
                                if (!parsingText && HasValue(newTextLine))
                                {
                                    newTextLine =
                                        $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                                }
                                // Escape all contents inside TEXT tags
                                else if (parsingText)
                                {
                                    newTextLine = SecurityElement.Escape(newTextLine);
                                }

                                // Don't encode the opening tag
                                if (currentTagName == "TEXT")
                                {
                                    parsingText = true;
                                }

                                xmlText.AppendLine(newTextLine);
                            }

                            var counter = Interlocked.Increment(ref ncFilesRead);
                            if (counter % 100 == 0)
                            {
                                var interval = DateTime.Now - loopStartingTime;
                                Log.Trace($"SECDataConverter.Process(): {counter} nc files read at {100 / interval.TotalMinutes:N2} files/min.");
                                loopStartingTime = DateTime.Now;
                            }

                            ISECReport report;
                            try
                            {
                                report = factory.CreateSECReport(xmlText.ToString());
                            }
                            // Ignore unsupported form types for now
                            catch (DataException)
                            {
                                return;
                            }
                            catch (XmlException e)
                            {
                                Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file: {rawReportFilePath.Key}");
                                return;
                            }
                            catch (Exception e)
                            {
                                Log.Error(e, "SECDataConverter.Process(): Unknown error encountered");
                                return;
                            }

                            // First filer listed in SEC report is usually the company listed on stock exchanges
                            var companyCik = report.Report.Filers.First().CompanyData.Cik;

                            // Some companies can operate under two tickers, but have the same CIK.
                            // Don't bother continuing if we don't find any tickers for the given CIK
                            List<string> tickers;
                            if (!CikTicker.TryGetValue(companyCik, out tickers))
                            {
                                return;
                            }

                            if (!File.Exists(Path.Combine(RawSource, "indexes", $"{companyCik}.json")))
                            {
                                Log.Error($"SECDataConverter.Process(): {report.Report.FilingDate:yyyy-MM-dd}:{rawReportFilePath.Key} - Failed to find index file for ticker {tickers.FirstOrDefault()} with CIK: {companyCik}");
                                return;
                            }

                            try
                            {
                                // The index file can potentially be corrupted
                                GetPublicationDate(report, companyCik);
                            }
                            catch (Exception e)
                            {
                                // Logged only; the report is still stored below
                                Log.Error(e, $"SECDataConverter.Process(): {report.Report.FilingDate:yyyy-MM-dd}:{rawReportFilePath.Key} - Index file loading failed for ticker: {tickers.FirstOrDefault()} with CIK: {companyCik} even though it exists");
                            }

                            // If the ticker does not resolve to a map file, or no mapped symbol
                            // is found for the processing date, we skip writing it.
                            foreach (var ticker in tickers)
                            {
                                var tickerMapFile = mapFileResolver.ResolveMapFile(ticker, processingDate);
                                if (!tickerMapFile.Any())
                                {
                                    Log.Trace($"SECDataConverter.Process(): {processingDate} - Failed to find map file for ticker: {ticker}");
                                    continue;
                                }

                                // Map the current ticker to the ticker it was in the past using the map file system
                                var mappedTicker = tickerMapFile.GetMappedSymbol(processingDate);

                                // If no suitable date is found for the symbol in the map file, we skip writing the data
                                if (string.IsNullOrEmpty(mappedTicker))
                                {
                                    Log.Trace($"SECDataConverter.Process(): {processingDate} - Failed to find mapped symbol for ticker: {ticker}");
                                    continue;
                                }

                                var tickerReports = Reports.GetOrAdd(
                                    mappedTicker,
                                    _ => new ConcurrentDictionary<DateTime, List<ISECReport>>()
                                );
                                var reports = tickerReports.GetOrAdd(
                                    report.Report.FilingDate.Date,
                                    _ => new List<ISECReport>()
                                );

                                // List<T> is not safe for concurrent writers and several workers
                                // may hold the same list for a given ticker and filing date
                                lock (reports)
                                {
                                    reports.Add(report);
                                }
                            }
                        }
                    );
                }

                Log.Trace($"SECDataConverter.Process(): {ncFilesRead} nc files read finished in {DateTime.Now - startingTime:g}.");

                Parallel.ForEach(
                    Reports.Keys,
                    ticker =>
                    {
                        List<ISECReport> reports;
                        if (!Reports[ticker].TryRemove(processingDate, out reports))
                        {
                            return;
                        }

                        WriteReport(reports, ticker);
                    }
                );
            }
            finally
            {
                // Delete the raw data we copied to the temp folder, also when processing failed.
                // File.Delete does not throw when the file was never created.
                File.Delete(tempPath);
            }
        }