/// <summary>
/// Lazily reads each gzip-compressed file, parses the decompressed content as HTML
/// and yields the text extracted from it.
/// </summary>
/// <param name="files">Paths of the gzip-compressed documents to read.</param>
/// <returns>
/// One (path, text) tuple per file whose processed text is non-empty. Documents that
/// contain a "type" element whose content starts with "GRAPHIC" are skipped and are
/// not counted towards the progress index.
/// </returns>
/// <remarks>
/// Every 100th processed file updates <c>FilesDescription</c> with the current index.
/// The sequence is produced with <c>yield return</c>, so files are opened only as the
/// caller enumerates.
/// </remarks>
public IEnumerable<Tuple<string, string>> ParseAndGetTextes(IEnumerable<string> files)
{
    int fileIndex = 1;
    ParsingText parcer = new ParsingText();
    HtmlParser htmlParser = new HtmlParser();

    foreach (string path in files)
    {
        // The file is only read; the (path, mode) constructor would request read/write
        // access and fail on read-only files. FileShare.Read keeps the previous sharing mode.
        using (var fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (var gZipStream = new GZipStream(fileStream, CompressionMode.Decompress))
        using (MemoryStream buffStrm = new MemoryStream())
        {
            // Decompress fully into memory, then rewind so the parser reads from the start.
            gZipStream.CopyTo(buffStrm);
            buffStrm.Position = 0;

            using (IHtmlDocument document = htmlParser.Parse(buffStrm))
            {
                // Decide whether to skip before doing any text extraction work.
                bool isGraphic = document.All
                    .Where(x => x.LocalName == "type")
                    .Any(x => x.TextContent.StartsWith("GRAPHIC", StringComparison.Ordinal));
                if (isGraphic)
                {
                    continue;
                }

                // Materialize once: the query was previously enumerated twice (Any + Select).
                var elements = document.All.OfType<IText>().ToList();
                string text = elements.Count > 0
                    ? string.Join(Environment.NewLine, elements.Select(x => x.Text))
                    : document.All.FirstOrDefault(x => x.LocalName == "text")?.TextContent;

                // text is null when the document has neither text nodes nor a "text" element.
                if (text != null)
                {
                    text = parcer.TrimForParsing(text);
                }

                if (fileIndex % 100 == 0)
                {
                    FilesDescription = $"Loading files[{fileIndex}]...";
                }

                fileIndex++;

                if (!string.IsNullOrEmpty(text))
                {
                    yield return new Tuple<string, string>(path, text);
                }
            }
        }
    }
}
/// <summary>
/// Loads the texts returned by <c>GetTextes(GetFiles())</c>, runs
/// <c>ParsingText.ParseBySearch</c> over each of them, writes the records that have both
/// a name-person and an aggregated-amount match to <c>FileManager.FileResults</c>, and
/// stores the records that have a name-person match but no percent-owned match in the
/// <c>textes</c>, <c>files</c>, <c>parsedRegions</c> and <c>dataSets</c> members.
/// </summary>
/// <remarks>
/// Finishes by updating <c>FilesDescription</c> with the counts and the parsing time and
/// showing a confirmation message box. Any exception is reported and rethrown.
/// </remarks>
private void LoadFiles()
{
    try
    {
        var sw = Stopwatch.StartNew();
        ParsingText parcer = new ParsingText();

        // One entry per parse result; a single file may therefore contribute several entries.
        var fileInfos = GetTextes(GetFiles())
            .Where(x => !string.IsNullOrEmpty(x.Item2))
            .Select(x => new { fileName = x.Item1, text = parcer.TrimForParsing(x.Item2) })
            .SelectMany(x => parcer.ParseBySearch(x.text)
                .Select(parsedResult => new { x.fileName, x.text, parsedResult }))
            .ToList();

        sw.Stop();
        TimeSpan timeParsing = sw.Elapsed;

        // Shared filters are evaluated once instead of being repeated in every query below.
        var withText = fileInfos
            .Where(x => !string.IsNullOrEmpty(x.text))
            .ToList();
        var withNamePerson = withText
            .Where(x => !string.IsNullOrEmpty(x.parsedResult.NamePersonPrefix))
            .ToList();

        var notParcedNamePerson = withText
            .Where(x => string.IsNullOrEmpty(x.parsedResult.NamePersonPrefix))
            .ToList();
        var parcedAggregatedAmount = withNamePerson
            .Where(x => !string.IsNullOrEmpty(x.parsedResult.AggregatedAmountPrefix))
            .ToList();
        var notParcedAggregatedAmount = withNamePerson
            .Where(x => string.IsNullOrEmpty(x.parsedResult.AggregatedAmountPrefix))
            .ToList();
        var notParcedPercentOwned = withNamePerson
            .Where(x => string.IsNullOrEmpty(x.parsedResult.PercentOwnedPrefix))
            .ToList();

        // One tab-separated line per record: each matched value is shown in [brackets]
        // between its prefix and postfix, followed by the extracted value columns.
        var matches = parcedAggregatedAmount
            .Select(x =>
            {
                var r = x.parsedResult;
                string textNamePerson = $"{r.NamePersonPrefix}[{r.NamePerson}]{r.NamePersonPostfix}";
                string textAggregatedAmount = $"{r.AggregatedAmountPrefix}[{r.AggregatedAmount}]{r.AggregatedAmountPostfix}";
                string textPercentOwned = $"{r.PercentOwnedPrefix}[{r.PercentOwned}]{r.PercentOwnedPostfix}";
                return $"{textNamePerson}\t{r.NamePersonValue}\t{r.NamePersonValue.Length}\t{textAggregatedAmount}\t{r.AggregatedAmountValue}\t{textPercentOwned}\t{r.PercentOwnedValue}\t{x.fileName}";
            })
            .ToArray();
        File.WriteAllLines(FileManager.FileResults, matches);

        // Records exposed for further inspection: those without a percent-owned match.
        // To inspect another category, substitute notParcedAggregatedAmount, or
        // notParcedNamePerson (which has no usable Region, so use x.text in its place).
        textes = notParcedPercentOwned.Select(x => x.text).ToList();
        files = notParcedPercentOwned.Select(x => x.fileName).ToList();
        parsedRegions = notParcedPercentOwned.Select(x => x.parsedResult.Region).ToList();
        dataSets = notParcedPercentOwned.Select(x => parcer.TrimForClustering(x.parsedResult.Region)).ToList();

        FilesDescription = $"All files: {fileInfos.Count}; Not parced files: Name Person: {notParcedNamePerson.Count}, Amount: {notParcedAggregatedAmount.Count}, Percent: {notParcedPercentOwned.Count}, Time: {timeParsing:mm\\:ss}";
        MessageBox.Show("Press OK", "Confirmation");
    }
    catch (Exception e)
    {
        // Pass the exception as the detail message so the assertion output shows the cause.
        Debug.Fail("LoadFiles", e.ToString());
        Console.WriteLine(e);
        throw;
    }
}