コード例 #1
0
        /// <summary>
        /// Reads items from "instances.jsonl" and truths from "truth.jsonl" (both located via
        /// <c>BaseUrl</c>), pairs each item with the truth that has the same <c>Id</c>, writes one
        /// CSV record per item to "training.csv", and updates the clickbait / non-clickbait
        /// counters and word tables from each item's title.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// "truth.jsonl" contains fewer lines than "instances.jsonl".
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// An item has no remaining truth entry with a matching <c>Id</c>.
        /// </exception>
        private void Run()
        {
            int    counter = 0;
            string line;

            // 'using' guarantees all three files are released even when parsing or matching throws
            // (the truth reader was previously never closed at all).
            using (StreamReader instances   = new StreamReader(BaseUrl + "instances.jsonl"))
            using (StreamWriter csvWriter   = new StreamWriter(BaseUrl + "training.csv"))
            using (StreamReader truthreader = new StreamReader(BaseUrl + "truth.jsonl"))
            {
                var csv = new CsvWriter(csvWriter);

                // Header row. NOTE(review): the Flush + WriteLine pairing is kept exactly as in the
                // original; whether Flush emits the pending header fields depends on the CsvHelper
                // version in use - confirm before changing.
                csv.WriteHeader(typeof(CsvItem));
                csv.Flush();
                csvWriter.WriteLine();

                List <Item>  items  = new List <Item>();
                List <Truth> truths = new List <Truth>();

                // Both files are read in lock-step: one truth line is consumed per instance line.
                while ((line = instances.ReadLine()) != null)
                {
                    string truthLine = truthreader.ReadLine();
                    if (truthLine == null)
                    {
                        throw new InvalidDataException(
                            $"truth.jsonl ended before instances.jsonl (after {truths.Count} entries).");
                    }

                    items.Add(JsonConvert.DeserializeObject <Item>(line));
                    truths.Add(JsonConvert.DeserializeObject <Truth>(truthLine));
                }
                csv.Flush();

                // Input is fully loaded; release the read handles before the write phase.
                instances.Close();
                truthreader.Close();

                foreach (var item in items)
                {
                    // Take the first still-unused truth with the same Id. Removing it by index
                    // avoids a second scan of the list and keeps duplicates paired in file order.
                    int truthIndex = truths.FindIndex(i => i.Id == item.Id);
                    if (truthIndex < 0)
                    {
                        throw new InvalidOperationException(
                            $"No truth entry found for item {item.Id} (record {counter}).");
                    }

                    Truth t = truths[truthIndex];
                    truths.RemoveAt(truthIndex);

                    csv.WriteRecord(Convert(item, t));
                    csv.NextRecord();

                    // A mean rating above 0.5 counts the item as clickbait.
                    if (t.TruthMean > 0.5)
                    {
                        Clickbait++;
                        AddWordTo(clickBaitWords, item.TargetTitle);
                    }
                    else
                    {
                        NonClickBait++;
                        AddWordTo(nonClickBaitWords, item.TargetTitle);
                    }

                    // Periodic flush every 500 records (including the very first one).
                    if (counter % 500 == 0)
                    {
                        csv.Flush();
                    }

                    counter++;
                }

                csvWriter.Close();
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds the CSV feature row for one item: character / word / paragraph counts for the
        /// title, article and description, average word lengths, the truth ratings, and a
        /// time-of-day flag derived from the post timestamp.
        /// </summary>
        /// <param name="item">The item whose title, paragraphs, description, media and timestamp are read.</param>
        /// <param name="truth">The truth entry supplying the mean and median ratings.</param>
        /// <returns>
        /// The populated <c>CsvItem</c>. When the timestamp cannot be parsed, a message is written
        /// to the console and none of the time-of-day flags is set.
        /// </returns>
        static CsvItem Convert(Item item, Truth truth)
        {
            // Words are approximated as "number of spaces + 1", so an empty text counts as one word.
            int wordsInArticle = item.TargetParagraphs.Sum(i => i.Count(j => j == ' ') + 1);
            int charsInArticle = item.TargetParagraphs.Sum(i => i.Count());

            var res = new CsvItem
            {
                NumberOfQuestionMarksInTitle = item.TargetTitle.Count(i => i == '?'),
                TitleLength                = item.TargetTitle.Count(),
                NumberOfWordsInTitle       = item.TargetTitle.Count(i => i == ' ') + 1,
                NumberOfCharsInTitle       = item.TargetTitle.Count(),
                NumberOfWordsInArticle     = wordsInArticle,
                NumberOfCharsInArticle     = charsInArticle,
                NumberOfParagraphs         = item.TargetParagraphs.Count,
                NumberOfCharsInDescription = item.TargetDescription.Count(),
                NumberOfWordsInDescription = item.TargetDescription.Count(i => i == ' ') + 1,
                ClickBaitMeanRating        = truth.TruthMean,
                ClickBaitMedianRating      = truth.TruthMedian,
                // NOTE(review): this stores the media count, not a 0/1 flag - confirm that is intended.
                HasImage = item.PostMedia.Count
            };

            res.AverageWordLengthTitle       = (double)res.NumberOfCharsInTitle / res.NumberOfWordsInTitle;
            // An item without paragraphs has zero words; report 0 instead of writing NaN (0 / 0).
            res.AverageWordLengthArticle     = res.NumberOfWordsInArticle == 0
                ? 0
                : (double)res.NumberOfCharsInArticle / res.NumberOfWordsInArticle;
            res.AverageWordLengthDescription = (double)res.NumberOfCharsInDescription / res.NumberOfWordsInDescription;

            // The "+0000 " offset token is stripped so the remainder matches the fixed format below.
            // TryParseExact (plus the null-safe Replace) reports bad or missing timestamps without
            // relying on a catch-all exception handler.
            DateTime parsed;
            string   timestamp = item.PostTimestamp?.Replace("+0000 ", "");
            if (DateTime.TryParseExact(timestamp, "ddd MMM dd HH:mm:ss yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out parsed))
            {
                var hour = parsed.Hour;
                if (hour > 6 && hour <= 8)
                {
                    res.EarlyMorning = 1;
                }
                else if (hour > 8 && hour <= 12)
                {
                    res.Morning = 1;
                }
                else if (hour > 12 && hour <= 18)
                {
                    res.Afternoon = 1;
                }
                else if (hour > 18 && hour <= 21)
                {
                    res.Evening = 1;
                }
                else
                {
                    // Hours 22-23 and 0-6.
                    res.Night = 1;
                }
            }
            else
            {
                Console.WriteLine("Error at " + item.PostTimestamp);
            }

            return(res);
        }