Пример #1
0
        /// <summary>
        /// Trains a structure extractor from annotated blobs and uploads the serialized
        /// program to the same container under <c>programsFolder</c>.
        /// </summary>
        /// <param name="blobContainerName">Name of the blob container that holds the training files.</param>
        /// <param name="trainDirectory">Directory inside the container whose blobs are listed for training.</param>
        /// <param name="annotations">Annotated examples; each one names a blob and a start/end offset.</param>
        /// <returns>The identifier under which the serialized extractor was uploaded.</returns>
        /// <exception cref="AggregateException">
        /// No annotations were supplied, the container does not exist, or the train directory
        /// contains no block blobs.
        /// </exception>
        public async Task<Guid> Train([FromUri] string blobContainerName,
                                      [FromUri] string trainDirectory,
                                      [FromBody] IEnumerable<Annotation> annotations)
        {
            // Check annotations. A null collection is reported the same way as an empty one
            // instead of failing inside ToArray().
            var annotationsArr = annotations as Annotation[] ?? annotations?.ToArray();

            if (annotationsArr == null || annotationsArr.Length == 0)
            {
                throw new AggregateException("No annotations provided for learning");
            }

            // Check blobs folder
            var storageContainer = this.storageClient.GetContainerReference(blobContainerName);

            if (!await storageContainer.ExistsAsync())
            {
                throw new AggregateException($"{blobContainerName} does not exists");
            }

            var dir = storageContainer.GetDirectoryReference(trainDirectory);

            // Keep only block blobs: the listing may also contain other item kinds, and the
            // sampling below needs CloudBlockBlob members (Name, DownloadTextAsync).
            var blobs = dir.ListBlobs().OfType<CloudBlockBlob>().ToArray();

            if (blobs.Length == 0)
            {
                throw new AggregateException($"No blobs found in blobs container: {blobContainerName}  folder: {trainDirectory}");
            }

            // Read annotations: download every annotated blob and pair its text with the offsets.
            var annotatedSamplesContentFetchingTasks = annotationsArr.Select(async annotation =>
            {
                var blobRef = storageContainer.GetBlobReference(annotation.FileName);
                using (var stream = new MemoryStream())
                {
                    await blobRef.DownloadToStreamAsync(stream);

                    // Rewind so the reader starts at the beginning of the downloaded content.
                    stream.Position = 0;
                    using (var sr = new StreamReader(stream))
                    {
                        var content = await sr.ReadToEndAsync();
                        return new Tuple<string, uint, uint>(content, (uint)annotation.StartOffset, (uint)annotation.EndOffset);
                    }
                }
            });

            // Sample up to NonLabeledSampleSize blobs from the directory that are not annotated.
            var annotatedSampleFileNames = annotationsArr.Select(a => a.FileName).ToHashSet();
            var nonLabeledSampleBlobs    = blobs.Where(b => !annotatedSampleFileNames.Contains(b.Name)).Take(NonLabeledSampleSize);

            var nonLabeledSamplesContentFetchingTasks = nonLabeledSampleBlobs.Select(b => b.DownloadTextAsync());

            var examples = await Task.WhenAll(annotatedSamplesContentFetchingTasks);

            var nonLabeledSamples = await Task.WhenAll(nonLabeledSamplesContentFetchingTasks);

            // Train
            var extractor = await StructureExtractor.TrainExtractorAsync(examples, nonLabeledSamples);

            // Save program
            var serializedExtractor = extractor.Serialize();
            var extractorGuid       = Guid.NewGuid();
            var blockBlob           = storageContainer.GetBlockBlobReference($"{programsFolder}/{extractorGuid:D}");
            await blockBlob.UploadTextAsync(serializedExtractor);

            return extractorGuid;
        }
Пример #2
0
        /// <summary>
        /// Scrapes sale announces until at least <c>ItemsCountForStep</c> have been collected,
        /// persisting each announce as soon as it has been parsed.
        /// </summary>
        /// <remarks>
        /// A <see cref="WebException"/> while processing an offer is retried a bounded number of
        /// times with a short delay. Any other failure is logged and the listing is requested
        /// again. The loop stops when a listing page yields no offers or when several iterations
        /// in a row fail without saving anything, so the method always terminates.
        /// </remarks>
        public override async Task ProcessDataAsync()
        {
            // Retry/abort limits; local so the block stays self-contained.
            const int maxAttemptsPerOffer    = 3;
            const int maxConsecutiveFailures = 5;
            var retryDelay = TimeSpan.FromSeconds(1);

            var scrapedSaleAnnounces = new List<TransportSaleAnnounce>();
            var consecutiveFailures  = 0;

            while (ItemsCountForStep > scrapedSaleAnnounces.Count)
            {
                try
                {
                    var urlForScrapping = UrlForScrapping + $"&start={ScrappingPage}";
                    var htmlDocument    = await DataLoader.LoadHtmlDocumentAsync(urlForScrapping);

                    var previewOffers = StructureExtractor.GetPreviewOfferStructure(htmlDocument, ScrappingPage).ToList();

                    if (previewOffers.Count == 0)
                    {
                        // Nothing left to scrape: without this exit neither ScrappingPage nor the
                        // collected count would change and the same URL would be fetched forever.
                        break;
                    }

                    for (int i = 0; i < previewOffers.Count; i++)
                    {
                        // Bounded retry loop for transient web failures on a single offer.
                        for (int attempt = 1; ; attempt++)
                        {
                            try
                            {
                                var sourceLink           = SourceLinkParser.GetLink(previewOffers[i], BaseUrl);
                                var detailedHtmlDocument = await DataLoader.LoadHtmlDocumentAsync(sourceLink);

                                var detailedOfferNode = StructureExtractor.GetDetailedOfferStructure(detailedHtmlDocument);

                                var (brandId, modelId) = BrandModelParser.ParseForDetailed(detailedOfferNode);

                                var saleAnnounce = new TransportSaleAnnounce()
                                {
                                    SourceLink         = sourceLink,
                                    TransmissionTypeId = TransmissionTypeParser.ParseForDetailed(detailedOfferNode),
                                    AdNumber           = OfferNumberParser.ParseForDetailed(detailedOfferNode),
                                    BodyTypeId         = BodyTypeParser.ParseForDetailed(detailedOfferNode),
                                    CityId             = RegionParser.ParseForDetailed(detailedOfferNode).Id,
                                    Description        = DescriptionParser.ParseForDetailed(detailedOfferNode),
                                    DriveUnitId        = DriveUnitParser.ParseForDetailed(detailedOfferNode),
                                    EngineVolumetric   = EngineVolumetricParser.ParseForDetailed(detailedOfferNode),
                                    FuelTypeId         = FuelTypeParser.ParseForDetailed(detailedOfferNode),
                                    PreviewImageLink   = ImageLinkParser.ParseForPreview(previewOffers[i]),
                                    Mileage            = MileageParser.ParseForDetailed(detailedOfferNode),
                                    BrandId            = brandId,
                                    VehicleTypeId      = VehicleTypeParser.ParseForDetailed(detailedOfferNode),
                                    ModelId            = modelId,
                                    PriceInDollars     = PriceParser.ParseForDetailed(detailedOfferNode),
                                    UpdateOfferTime    = PublishDateParser.ParseForDetailed(detailedOfferNode),
                                    Year               = YearParser.ParseForDetailed(detailedOfferNode),
                                    CreatedAt          = DateTime.Now,
                                    SourceProviderId   = (int)SourceProviderEnum.RST
                                };

                                var carConditionIds = CarConditionParser.ParseForPreview(previewOffers[i]);
                                foreach (var carConditionId in carConditionIds)
                                {
                                    saleAnnounce.TransportConditions.Add(new TransportConditionInSaleAnnounce()
                                    {
                                        TransportConditionId = carConditionId
                                    });
                                }

                                scrapedSaleAnnounces.Add(saleAnnounce);

                                UnitOfWork.TransportSaleAnnouncesRepository.Add(saleAnnounce);
                                await UnitOfWork.SaveChangesAsync();

                                // NOTE(review): the counter advances once per saved offer, not once per
                                // listing request - presumably "start" is an item offset; confirm.
                                ScrappingPage++;

                                // Progress was made, so earlier failures no longer count as consecutive.
                                consecutiveFailures = 0;
                                break;
                            }
                            catch (WebException) when (attempt < maxAttemptsPerOffer)
                            {
                                // Transient web failure: wait briefly and retry the same offer. On the
                                // last attempt the filter is false and the exception reaches the outer catch.
                                await Task.Delay(retryDelay);
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    _logger.LogError($"{ex.Message}\n{ex.StackTrace}");

                    consecutiveFailures++;
                    if (consecutiveFailures >= maxConsecutiveFailures)
                    {
                        // Give up instead of re-requesting a listing that keeps failing.
                        _logger.LogError($"Scrapping stopped after {maxConsecutiveFailures} consecutive failures");
                        break;
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Runs a previously trained extractor over every block blob in a directory and returns
        /// one annotation per blob.
        /// </summary>
        /// <param name="blobContainerName">Name of the blob container that holds the files to score.</param>
        /// <param name="scoreDirectory">Directory inside the container whose block blobs are processed.</param>
        /// <param name="programId">Identifier of the serialized extractor stored under <c>programsFolder</c>.</param>
        /// <returns>
        /// One <c>Annotation</c> per block blob. When the extractor yields no region (or an empty
        /// value) only <c>FileName</c> is set; otherwise the region's start and end are included.
        /// </returns>
        /// <exception cref="AggregateException">
        /// The container does not exist, the directory contains no block blobs, or no program
        /// with the given id is stored.
        /// </exception>
        public async Task<IEnumerable<Annotation>> Extract([FromUri] string blobContainerName,
                                                           [FromUri] string scoreDirectory,
                                                           [FromUri] Guid programId)
        {
            // Check blobs folder
            var storageContainer = this.storageClient.GetContainerReference(blobContainerName);

            if (!await storageContainer.ExistsAsync())
            {
                throw new AggregateException($"{blobContainerName} does not exists");
            }

            var dir = storageContainer.GetDirectoryReference(scoreDirectory);

            // Keep only block blobs; filtering by type here removes the need for a cast later.
            var blobFiles = dir.ListBlobs().OfType<CloudBlockBlob>().ToArray();

            if (blobFiles.Length == 0)
            {
                throw new AggregateException($"No blobs found in blobs folder {scoreDirectory}");
            }

            var programBlobRef = storageContainer.GetBlobReference($"{programsFolder}/{programId:D}");

            if (!await programBlobRef.ExistsAsync())
            {
                throw new AggregateException($"Program with id {programId:D} not found");
            }

            // Load program
            string serializedProgram;

            using (var stream = new MemoryStream())
            {
                await programBlobRef.DownloadToStreamAsync(stream);

                // Rewind so the reader starts at the beginning of the downloaded content.
                stream.Position = 0;
                using (var sr = new StreamReader(stream))
                {
                    serializedProgram = await sr.ReadToEndAsync();
                }
            }

            var extractor = StructureExtractor.Deserialize(serializedProgram);

            // Extraction: download each blob's text and run the extractor on it.
            var extractionTasks = blobFiles.Select(async blob =>
            {
                var content      = await blob.DownloadTextAsync();
                var stringRegion = extractor.Extract(content);

                if (stringRegion == null || string.IsNullOrEmpty(stringRegion.Value))
                {
                    // Nothing extracted: report the file without offsets.
                    return new Annotation
                    {
                        FileName = blob.Name
                    };
                }

                return new Annotation
                {
                    FileName    = blob.Name,
                    StartOffset = (int)stringRegion.Start,
                    EndOffset   = (int)stringRegion.End
                };
            });

            return await Task.WhenAll(extractionTasks);
        }