/// <summary>
/// Flattens the free text of a record onto a single line and strips double quotes.
/// </summary>
/// <param name="record">Record whose <c>Text</c> is normalized in place.</param>
/// <returns>
/// The same <paramref name="record"/> instance. It is returned untouched when its
/// text is null or empty.
/// </returns>
static Open311Data CleanOpen311(Open311Data record)
{
    if (!string.IsNullOrEmpty(record.Text))
    {
        record.Text = record.Text
            // Every flavour of line break becomes a single space.
            .Replace(Environment.NewLine, " ")
            .Replace("\n", " ")
            .Replace("\r", " ")
            // NOTE(review): both arguments look identical as written, which makes this
            // a no-op; confirm which characters were originally intended here.
            .Replace(" ", " ")
            // Double quotes are removed entirely.
            .Replace("\"", string.Empty);
    }

    return record;
}
/// <summary>
/// Lazily reads Open311 records from a tab-separated text file.
/// </summary>
/// <remarks>
/// The first line of the file is read and discarded as a header. A line that splits
/// into exactly <paramref name="expectedTokenCount"/> tokens starts a new record; any
/// other line is treated as a continuation of the text of the record currently being
/// built. A record is emitted (after <c>CleanOpen311</c>) only when both its name and
/// its text are non-empty. Rows whose service code cannot be parsed or is not a known
/// service type are dropped together with their continuation lines. Once the whole
/// file has been enumerated, the number of distinct unknown service type strings is
/// written to the console.
/// </remarks>
/// <param name="path">Path of the file to read.</param>
/// <param name="expectedTokenCount">Number of tab-separated tokens that marks the first line of a record.</param>
/// <param name="codeIndex">Token index of the service code.</param>
/// <param name="nameIndex">Token index of the service name.</param>
/// <param name="textIndex">Token index of the request text.</param>
/// <returns>A deferred sequence of cleaned records; the file is read while the sequence is enumerated.</returns>
static IEnumerable<Open311Data> OpenFile(string path, int expectedTokenCount, int codeIndex, int nameIndex, int textIndex)
{
    var serviceTypes = new Open311ServiceTypes();
    var unknownTypes = new HashSet<string>();

    // Record that is still being assembled (it may span several physical lines).
    Open311Data lastRecord = null;

    // A record is only worth returning when it carries both a name and a text.
    bool IsComplete(Open311Data candidate) =>
        !string.IsNullOrEmpty(candidate.Text) && !string.IsNullOrEmpty(candidate.Name);

    using (var reader = new StreamReader(path))
    {
        // Skip the header line.
        reader.ReadLine();

        const char Delimiter = '\t';
        string line;
        while (null != (line = reader.ReadLine()))
        {
            var tokens = line.Split(Delimiter);

            if (expectedTokenCount != tokens.Length)
            {
                // Continuation of a multi-line text. ReadLine() strips the line
                // terminator, so join with a space to keep the words apart.
                if (null != lastRecord)
                {
                    lastRecord.Text = string.IsNullOrEmpty(lastRecord.Text)
                        ? line
                        : lastRecord.Text + " " + line;
                }

                continue;
            }

            // A new record starts here, so the pending one is finished.
            if (null != lastRecord)
            {
                var finished = CleanOpen311(lastRecord);

                // Clear the pending slot before parsing the new row. Otherwise, if the
                // new row is rejected below, the record just returned would absorb that
                // row's continuation lines and be returned a second time.
                lastRecord = null;

                if (IsComplete(finished))
                {
                    yield return finished;
                }
            }

            var serviceType = tokens[codeIndex];
            if (!float.TryParse(serviceType, out float code))
            {
                // Not a numeric service code: ignore the row.
                continue;
            }

            if (!serviceTypes.IsKnownServiceType(code))
            {
                unknownTypes.Add(serviceType);
                continue;
            }

            // The raw request text is stored as-is; cleaning happens when the record is flushed.
            lastRecord = new Open311Data
            {
                Code = code,
                Name = tokens[nameIndex],
                Text = tokens[textIndex]
            };
        }
    }

    // Flush the record that was still pending when the file ended.
    if (null != lastRecord)
    {
        var finished = CleanOpen311(lastRecord);
        if (IsComplete(finished))
        {
            yield return finished;
        }
    }

    if (0 < unknownTypes.Count)
    {
        Console.WriteLine($"{unknownTypes.Count} unknown service types!");
    }
}