Пример #1
0
        /// <summary>
        /// Lazily reads each GZip-compressed file, parses the decompressed content with
        /// <c>HtmlParser</c> and yields the file path together with its trimmed text.
        /// </summary>
        /// <remarks>
        /// Documents containing a <c>type</c> element whose text content starts with
        /// "GRAPHIC" are skipped and do not advance the progress counter. Documents whose
        /// text is null or empty are counted but not yielded. Every 100th counted
        /// document updates <c>FilesDescription</c> with a progress message.
        /// </remarks>
        /// <param name="files">Paths of the GZip-compressed documents to read.</param>
        /// <returns>
        /// A deferred sequence of (path, text) tuples; files are opened only as the
        /// sequence is enumerated.
        /// </returns>
        public IEnumerable<Tuple<string, string>> ParseAndGetTextes(IEnumerable<string> files)
        {
            int fileIndex = 1;
            ParsingText parcer = new ParsingText();
            HtmlParser htmlParser = new HtmlParser();

            foreach (string path in files)
            {
                using (MemoryStream buffStrm = new MemoryStream())
                {
                    // Open with read-only access: the FileStream(path, FileMode) overload asks for
                    // read/write access and therefore fails on read-only files. The file and GZip
                    // streams are closed as soon as the content is buffered, so no file handle is
                    // held while the consumer processes a yielded item.
                    using (var fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                    using (var gZipStream = new GZipStream(fileStream, CompressionMode.Decompress))
                    {
                        gZipStream.CopyTo(buffStrm);
                    }
                    buffStrm.Position = 0;

                    using (IHtmlDocument document = htmlParser.Parse(buffStrm))
                    {
                        // Decide whether to skip the document before doing any text extraction.
                        bool isGraphic = document.All
                            .Where(x => x.LocalName == "type")
                            .Any(x => x.TextContent.StartsWith("GRAPHIC", StringComparison.Ordinal));
                        if (isGraphic)
                        {
                            continue;
                        }

                        // Materialize once instead of enumerating the query for Any() and again for Join().
                        // NOTE(review): confirm that document.All can actually contain IText items;
                        // if it never does, the fallback to the "text" element is always taken.
                        var textParts = document.All.OfType<IText>().Select(x => x.Text).ToList();
                        string text = textParts.Count > 0
                            ? string.Join(Environment.NewLine, textParts)
                            : document.All.FirstOrDefault(x => x.LocalName == "text")?.TextContent;

                        // text is null when there is no "text" element; only trim real content,
                        // matching LoadFiles, which filters null/empty text before TrimForParsing.
                        if (!string.IsNullOrEmpty(text))
                        {
                            text = parcer.TrimForParsing(text);
                        }

                        if (fileIndex % 100 == 0)
                        {
                            FilesDescription = $"Loading files[{fileIndex}]...";
                        }
                        fileIndex++;

                        if (!string.IsNullOrEmpty(text))
                        {
                            yield return Tuple.Create(path, text);
                        }
                    }
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Loads the texts returned by <c>GetTextes(GetFiles())</c>, parses them with
        /// <c>ParsingText.ParseBySearch</c>, writes the entries that have both a name-person
        /// and an aggregated-amount match to <c>FileManager.FileResults</c>, and stores the
        /// entries lacking a percent-owned match in the <c>textes</c>, <c>files</c>,
        /// <c>parsedRegions</c> and <c>dataSets</c> members.
        /// </summary>
        /// <remarks>
        /// On completion <c>FilesDescription</c> is set to a summary line and a message box
        /// is shown. Any exception is reported via <c>Debug.Fail</c> and the console, then
        /// rethrown.
        /// </remarks>
        private void LoadFiles()
        {
            try
            {
                var sw = Stopwatch.StartNew();

                ParsingText parcer = new ParsingText();

                // One entry per ParseBySearch result, so a single file can contribute several entries.
                var fileInfos = GetTextes(GetFiles())
                    .Where(x => !string.IsNullOrEmpty(x.Item2))
                    .Select(x => new { fileName = x.Item1, text = parcer.TrimForParsing(x.Item2) })
                    .SelectMany(x => parcer.ParseBySearch(x.text)
                                           .Select(parsedResult => new { x.fileName, x.text, parsedResult }))
                    .ToList();

                sw.Stop();
                TimeSpan timeParsing = sw.Elapsed;

                // Evaluate the shared predicates once and derive the narrower lists from them,
                // instead of re-scanning fileInfos with the same conditions for every list.
                var withText = fileInfos
                    .Where(x => !string.IsNullOrEmpty(x.text))
                    .ToList();

                var notParcedNamePerson = withText
                    .Where(x => string.IsNullOrEmpty(x.parsedResult.NamePersonPrefix))
                    .ToList();

                var parcedNamePerson = withText
                    .Where(x => !string.IsNullOrEmpty(x.parsedResult.NamePersonPrefix))
                    .ToList();

                var parcedAggregatedAmount = parcedNamePerson
                    .Where(x => !string.IsNullOrEmpty(x.parsedResult.AggregatedAmountPrefix))
                    .ToList();

                var notParcedAggregatedAmount = parcedNamePerson
                    .Where(x => string.IsNullOrEmpty(x.parsedResult.AggregatedAmountPrefix))
                    .ToList();

                var notParcedPercentOwned = parcedNamePerson
                    .Where(x => string.IsNullOrEmpty(x.parsedResult.PercentOwnedPrefix))
                    .ToList();

                // One tab-separated line per entry: each matched fragment is shown in brackets
                // between its prefix and postfix, followed by the extracted value.
                var resultLines = parcedAggregatedAmount
                    .Select(x =>
                    {
                        var r = x.parsedResult;
                        string textNamePerson       = $"{r.NamePersonPrefix}[{r.NamePerson}]{r.NamePersonPostfix}";
                        string textAggregatedAmount = $"{r.AggregatedAmountPrefix}[{r.AggregatedAmount}]{r.AggregatedAmountPostfix}";
                        string textPercentOwned     = $"{r.PercentOwnedPrefix}[{r.PercentOwned}]{r.PercentOwnedPostfix}";
                        var valueNamePerson         = r.NamePersonValue;

                        return $"{textNamePerson}\t{valueNamePerson}\t{valueNamePerson.Length}\t{textAggregatedAmount}\t{r.AggregatedAmountValue}\t{textPercentOwned}\t{r.PercentOwnedValue}\t{x.fileName}";
                    })
                    .ToArray();

                File.WriteAllLines(FileManager.FileResults, resultLines);

                // The members below receive the entries that have a name-person match but no
                // percent-owned match. To inspect a different failure class, substitute
                // notParcedNamePerson (using x.text as the region) or notParcedAggregatedAmount.
                textes        = notParcedPercentOwned.Select(x => x.text).ToList();
                files         = notParcedPercentOwned.Select(x => x.fileName).ToList();
                parsedRegions = notParcedPercentOwned.Select(x => x.parsedResult.Region).ToList();
                dataSets      = notParcedPercentOwned.Select(x => parcer.TrimForClustering(x.parsedResult.Region)).ToList();

                // NOTE(review): fileInfos.Count is the number of parse results, not of distinct
                // files, although the message labels it "All files" — confirm the intended figure.
                FilesDescription = $"All files: {fileInfos.Count}; Not parced files: Name Person: {notParcedNamePerson.Count}, Amount: {notParcedAggregatedAmount.Count}, Percent: {notParcedPercentOwned.Count}, Time: {timeParsing:mm\\:ss}";
                MessageBox.Show("Press OK", "Confirmation");
            }
            catch (Exception e)
            {
                // Include the exception in the assertion so the failure dialog shows what went wrong.
                Debug.Fail("LoadFiles", e.ToString());
                Console.WriteLine(e);
                throw;
            }
        }