/// <summary>
/// Reads <c>instances.jsonl</c> and <c>truth.jsonl</c> from <c>BaseUrl</c>, writes one CSV row per
/// instance to <c>training.csv</c>, and updates the click-bait / non-click-bait counters and word
/// tables from each instance title.
/// </summary>
/// <exception cref="InvalidDataException">
/// <c>truth.jsonl</c> has fewer lines than <c>instances.jsonl</c>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// An instance has no remaining truth entry with the same Id.
/// </exception>
private void Run()
{
    List<Item> items = new List<Item>();
    List<Truth> truths = new List<Truth>();

    // Phase 1: load both input files completely. The readers are disposed even when
    // deserialization throws, and the output file is not touched until the inputs were read.
    using (StreamReader instances = new StreamReader(BaseUrl + "instances.jsonl"))
    using (StreamReader truthreader = new StreamReader(BaseUrl + "truth.jsonl"))
    {
        string line;
        while ((line = instances.ReadLine()) != null)
        {
            string truthLine = truthreader.ReadLine();
            if (truthLine == null)
            {
                throw new InvalidDataException(
                    "truth.jsonl ended before instances.jsonl at line " + (items.Count + 1));
            }

            items.Add(JsonConvert.DeserializeObject<Item>(line));
            truths.Add(JsonConvert.DeserializeObject<Truth>(truthLine));
        }
    }

    // Instances and truths are paired by Id rather than by line position, so the two files
    // do not have to be in the same order. Grouping once replaces a linear search plus a
    // list removal per instance. Each queue keeps file order, so repeated Ids are consumed
    // first-come-first-served.
    // NOTE(review): assumes Id is a non-null key with value equality (e.g. a string) - confirm.
    var truthsById = truths
        .GroupBy(t => t.Id)
        .ToDictionary(g => g.Key, g => new Queue<Truth>(g));

    // Phase 2: write the CSV.
    using (StreamWriter csvWriter = new StreamWriter(BaseUrl + "training.csv"))
    {
        var csv = new CsvWriter(csvWriter);
        csv.WriteHeader(typeof(CsvItem));
        csv.Flush();
        csvWriter.WriteLine(); // terminate the header row on the underlying writer

        int counter = 0;
        foreach (var item in items)
        {
            Queue<Truth> candidates;
            if (!truthsById.TryGetValue(item.Id, out candidates) || candidates.Count == 0)
            {
                throw new InvalidOperationException("No truth entry found for instance " + item.Id);
            }

            Truth t = candidates.Dequeue();

            csv.WriteRecord(Convert(item, t));
            csv.NextRecord();

            // A mean rating above 0.5 is counted as click bait.
            if (t.TruthMean > 0.5)
            {
                Clickbait++;
                AddWordTo(clickBaitWords, item.TargetTitle);
            }
            else
            {
                NonClickBait++;
                AddWordTo(nonClickBaitWords, item.TargetTitle);
            }

            // Periodic flush every 500 records.
            if (counter % 500 == 0)
            {
                csv.Flush();
            }

            counter++;
        }

        // Final flush so records written after the last periodic flush are handed over
        // before the stream is closed by the using block.
        csv.Flush();
    }
}
/// <summary>
/// Builds the CSV feature row for one instance and its truth entry.
/// </summary>
/// <remarks>
/// Word counts are computed as "number of spaces + 1", so an empty string counts as one word.
/// The post hour is mapped to exactly one time-of-day flag: (6,8] early morning, (8,12] morning,
/// (12,18] afternoon, (18,21] evening, everything else night. If the timestamp cannot be parsed,
/// a message is written to the console and no time-of-day flag is set.
/// </remarks>
/// <param name="item">Instance whose title, paragraphs, description, media and timestamp are read.</param>
/// <param name="truth">Truth entry supplying the mean and median ratings.</param>
/// <returns>The populated <c>CsvItem</c>.</returns>
static CsvItem Convert(Item item, Truth truth)
{
    var wordsInArticle = item.TargetParagraphs.Sum(p => p.Count(c => c == ' ') + 1);
    var charsInArticle = item.TargetParagraphs.Sum(p => p.Length);

    var res = new CsvItem
    {
        NumberOfQuestionMarksInTitle = item.TargetTitle.Count(c => c == '?'),
        TitleLength = item.TargetTitle.Length,
        NumberOfWordsInTitle = item.TargetTitle.Count(c => c == ' ') + 1,
        NumberOfCharsInTitle = item.TargetTitle.Length,
        NumberOfWordsInArticle = wordsInArticle,
        NumberOfCharsInArticle = charsInArticle,
        NumberOfParagraphs = item.TargetParagraphs.Count,
        NumberOfCharsInDescription = item.TargetDescription.Length,
        NumberOfWordsInDescription = item.TargetDescription.Count(c => c == ' ') + 1,
        ClickBaitMeanRating = truth.TruthMean,
        ClickBaitMedianRating = truth.TruthMedian,
        // NOTE(review): despite the name this stores the number of media entries, not a 0/1 flag.
        HasImage = item.PostMedia.Count
    };

    // Title and description word counts are always >= 1, so these divisions are safe.
    res.AverageWordLengthTitle = (double)res.NumberOfCharsInTitle / res.NumberOfWordsInTitle;
    res.AverageWordLengthDescription = (double)res.NumberOfCharsInDescription / res.NumberOfWordsInDescription;

    // An instance without paragraphs has zero article words; report 0 instead of NaN (0/0).
    res.AverageWordLengthArticle = res.NumberOfWordsInArticle > 0
        ? (double)res.NumberOfCharsInArticle / res.NumberOfWordsInArticle
        : 0.0;

    // The "+0000 " offset token is removed before parsing with the fixed format below.
    // TryParseExact returns false for null or malformed input, so no exception is needed
    // to detect a bad timestamp.
    string timestamp = item.PostTimestamp == null ? null : item.PostTimestamp.Replace("+0000 ", "");
    DateTime posted;
    if (DateTime.TryParseExact(timestamp, "ddd MMM dd HH:mm:ss yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out posted))
    {
        int hour = posted.Hour;
        if (hour > 6 && hour <= 8)
        {
            res.EarlyMorning = 1;
        }
        else if (hour > 8 && hour <= 12)
        {
            res.Morning = 1;
        }
        else if (hour > 12 && hour <= 18)
        {
            res.Afternoon = 1;
        }
        else if (hour > 18 && hour <= 21)
        {
            res.Evening = 1;
        }
        else
        {
            res.Night = 1;
        }
    }
    else
    {
        Console.WriteLine("Error at" + item.PostTimestamp);
    }

    return res;
}