/// <summary>
/// Reads every row of the DOCUMENTS table and routes it by its category:
/// rows without a category go to <paramref name="test"/> with a score of 0,
/// rows whose category is listed in <c>parameters.positiveCategories</c> go to
/// <paramref name="train"/> with a score of 100, and the remaining non-excluded rows go to
/// <paramref name="train"/> with a score of -100. Rows whose category is listed in
/// <c>parameters.excludedCategories</c> are counted but not written anywhere.
/// </summary>
/// <param name="input">Connection the DOCUMENTS query is executed on; it is not opened or closed here.</param>
/// <param name="test">Connection that receives the uncategorized rows.</param>
/// <param name="train">Connection that receives the categorized, non-excluded rows.</param>
/// <param name="parameters">Supplies the positive and excluded category ID lists.</param>
/// <param name="noteColumnName">Name of the column that is read into <c>NOTE_TEXT</c>.</param>
/// <returns>The number of rows read from DOCUMENTS, excluded rows included.</returns>
protected int Convert_ScoreUncategorized(OleDbConnection input, SQLiteConnection test, SQLiteConnection train, Parameters parameters, string noteColumnName)
{
    var totalCount = 0;
    var testCount = 0;
    var trainPositiveCount = 0;
    var trainNegativeCount = 0;
    var excludedCount = 0;

    // The column name is an identifier, so it cannot be supplied as a query parameter.
    var cmdText = "SELECT ED_ENC_NUM, " + noteColumnName + ", Category FROM DOCUMENTS";

    var positiveCategories = new HashSet<int>(parameters.positiveCategories);
    var excludedCategories = new HashSet<int>(parameters.excludedCategories);

    using (var command = new OleDbCommand(cmdText, input))
    using (var reader = command.ExecuteReader())
    {
        if (reader.HasRows)
        {
            var categoryColumnType = GetCategoryColumnType(reader);

            while (reader.Read())
            {
                var row = new Documents
                {
                    ED_ENC_NUM = reader.GetDouble(0),
                    // GetString throws on a NULL column value, so NULL notes are stored as empty text.
                    NOTE_TEXT = reader.IsDBNull(1) ? "" : reader.GetString(1),
                    Category = GetInt32ValueInvariant(reader, 2, categoryColumnType)
                };

                totalCount++;

                // Uncategorized documents are the ones that still have to be scored.
                if (row.Category == null)
                {
                    row.Score = 0;
                    test.Insert(row);
                    testCount++;
                    continue;
                }

                if (excludedCategories.Contains(row.Category.Value))
                {
                    excludedCount++;
                    continue;
                }

                if (positiveCategories.Contains(row.Category.Value))
                {
                    row.Score = 100;
                    trainPositiveCount++;
                }
                else
                {
                    row.Score = -100;
                    trainNegativeCount++;
                }

                train.Insert(row);
            }
        }
    }

    _logger.Log("Excluded documents count: " + excludedCount);
    _logger.Log("TEST documents count: " + testCount);
    _logger.Log("TRAIN positive documents count: " + trainPositiveCount);
    _logger.Log("TRAIN negative documents count: " + trainNegativeCount);

    AssertDocumentsCount(test, train);

    return totalCount;
}
/// <summary>
/// Translates the textual values of a dynamic column into numeric category IDs stored in
/// <c>Documents.Category</c>, and resolves the configured positive/excluded category titles
/// into ID lists.
/// </summary>
/// <remarks>
/// Steps, in order: load the (ID, Title) pairs of <c>DynamicColumnCategories</c> for
/// <c>parameters.dynamicColumnID</c>; reset <c>Documents.Category</c> to NULL for all rows;
/// for every loaded category set <c>Category</c> to its ID on the rows whose
/// <c>parameters.dynamicColumnTitle</c> column equals the category title. On return
/// <c>parameters.positiveCategories</c> and <c>parameters.excludedCategories</c> hold the IDs
/// whose titles appear in <c>parameters.dynamicPositiveCategories</c> and
/// <c>parameters.dynamicExcludedCategories</c> respectively (a title in both lists counts as positive).
/// </remarks>
/// <param name="inputFilePath">Path handed to <c>ConnectionStringHelper.GetConnectionString</c>.</param>
/// <param name="parameters">Conversion settings; its category ID arrays are overwritten.</param>
public void Convert(string inputFilePath, Parameters parameters)
{
    // Missing title lists are treated as empty lists.
    if (parameters.dynamicPositiveCategories == null)
    {
        parameters.dynamicPositiveCategories = new string[] { };
    }

    if (parameters.dynamicExcludedCategories == null)
    {
        parameters.dynamicExcludedCategories = new string[] { };
    }

    // NOTE(review): the flag reported by GetConnectionString is not inspected here.
    bool invalidPassword;
    using (var connection = new OleDbConnection(ConnectionStringHelper.GetConnectionString(inputFilePath, parameters.password, out invalidPassword)))
    {
        connection.Open();

        var categoriesTable = new DataTable();
        using (var selectCommand = connection.CreateCommand())
        {
            selectCommand.CommandText = "SELECT ID, Title FROM DynamicColumnCategories WHERE DynamicColumnID = @ID";
            selectCommand.Parameters.AddWithValue("@ID", parameters.dynamicColumnID);

            using (var adapter = new OleDbDataAdapter(selectCommand))
            {
                adapter.Fill(categoriesTable);
            }
        }

        // Start from a clean slate so rows that match no category end up without one.
        using (var clearCommand = connection.CreateCommand())
        {
            clearCommand.CommandText = "UPDATE Documents SET Category = NULL";
            clearCommand.ExecuteNonQuery();
        }

        var totalRowsUpdated = 0;
        var positiveCategories = new List<int>();
        var excludedCategories = new List<int>();

        using (var updateCommand = connection.CreateCommand())
        {
            // The column title is an identifier and cannot be passed as a query parameter.
            // NOTE(review): it is concatenated as-is; confirm it never comes from untrusted input.
            updateCommand.CommandText = "UPDATE Documents SET Category = @categoryID WHERE [" + parameters.dynamicColumnTitle + "] = @categoryTitle";

            // OLE DB binds parameters by position, so they are added in the order they occur in the statement.
            var categoryIdParameter = updateCommand.Parameters.AddWithValue("@categoryID", 0);
            var categoryTitleParameter = updateCommand.Parameters.AddWithValue("@categoryTitle", "");

            foreach (DataRow row in categoriesTable.Rows)
            {
                var categoryID = (int)row[0];
                var categoryTitle = (string)row[1];

                if (parameters.dynamicPositiveCategories.Contains(categoryTitle))
                {
                    positiveCategories.Add(categoryID);
                }
                else if (parameters.dynamicExcludedCategories.Contains(categoryTitle))
                {
                    excludedCategories.Add(categoryID);
                }

                categoryIdParameter.Value = categoryID;
                categoryTitleParameter.Value = categoryTitle;
                totalRowsUpdated += updateCommand.ExecuteNonQuery();
            }
        }

        parameters.positiveCategories = positiveCategories.ToArray();
        parameters.excludedCategories = excludedCategories.ToArray();

        _logger.Log("Categories converted for " + totalRowsUpdated + " document(s)");
    }
}
// NOTE(review): the two braces below close scopes opened outside this section; kept exactly as in the original.
}
}
/// <summary>
/// Reads every row of the DOCUMENTS table, drops the rows whose category is listed in
/// <c>parameters.excludedCategories</c>, and divides the remaining rows into a TEST and a
/// TRAIN portion. Positive rows (category listed in <c>parameters.positiveCategories</c>) and
/// negative rows (all others) are divided independently: the first
/// <c>parameters.validationPercentage</c> percent of each list is inserted into
/// <paramref name="test"/>, the rest is inserted into <paramref name="train"/> with a score of
/// 100 (positive) or -100 (negative).
/// </summary>
/// <param name="input">Connection the DOCUMENTS query is executed on; it is not opened or closed here.</param>
/// <param name="test">Connection that receives the validation portion.</param>
/// <param name="train">Connection that receives the scored training portion.</param>
/// <param name="parameters">Supplies the category ID lists and the validation percentage.</param>
/// <param name="noteColumnName">Name of the column that is read into <c>NOTE_TEXT</c>.</param>
/// <returns>The number of rows read from DOCUMENTS, excluded rows included.</returns>
/// <exception cref="ArgumentException"><c>parameters.positiveCategories</c> is null or empty.</exception>
/// <exception cref="InvalidOperationException">A row has no category.</exception>
protected int Convert_Divide(OleDbConnection input, SQLiteConnection test, SQLiteConnection train, Parameters parameters, string noteColumnName)
{
    if (parameters.positiveCategories == null || parameters.positiveCategories.Length == 0)
    {
        throw new ArgumentException("No positive categories");
    }

    var totalCount = 0;
    var excludedCount = 0;

    // The column name is an identifier, so it cannot be supplied as a query parameter.
    string cmdText = "SELECT ED_ENC_NUM, " + noteColumnName + ", Category FROM DOCUMENTS";

    var positiveCategories = new HashSet<int>(parameters.positiveCategories);
    // A missing exclusion list means that nothing is excluded.
    var excludedCategories = new HashSet<int>(parameters.excludedCategories ?? new int[0]);

    var positiveDocuments = new List<Documents>();
    var negativeDocuments = new List<Documents>();

    // The default constructor picks its own seed; seeding with DateTime.Now.Millisecond
    // would allow only 1000 different sequences.
    var random = new Random();

    using (var command = new OleDbCommand(cmdText, input))
    using (var reader = command.ExecuteReader())
    {
        if (reader.HasRows)
        {
            var categoryColumnType = GetCategoryColumnType(reader);

            while (reader.Read())
            {
                var row = new Documents
                {
                    ED_ENC_NUM = reader.GetDouble(0),
                    NOTE_TEXT = reader.IsDBNull(1) ? "" : reader.GetString(1),
                    Category = GetInt32ValueInvariant(reader, 2, categoryColumnType)
                };

                // Same exception type that reading .Value of an empty nullable raises, with a usable message.
                if (row.Category == null)
                {
                    throw new InvalidOperationException("A document without a category cannot be divided into TEST and TRAIN sets");
                }

                var category = row.Category.Value;

                if (excludedCategories.Contains(category))
                {
                    excludedCount++;
                }
                else if (positiveCategories.Contains(category))
                {
                    // presumably places the row at a random position so the list ends up shuffled - confirm in InsertRandom
                    InsertRandom(positiveDocuments, row, random);
                }
                else
                {
                    InsertRandom(negativeDocuments, row, random);
                }

                totalCount++;
            }
        }
    }

    // The fractional part is truncated, so small lists may contribute no TEST rows at all.
    var takePositiveDocsCount = (int)((positiveDocuments.Count / 100d) * parameters.validationPercentage);
    var takeNegativeDocsCount = (int)((negativeDocuments.Count / 100d) * parameters.validationPercentage);

    // TEST rows are inserted without a score being assigned here.
    foreach (var row in positiveDocuments.Take(takePositiveDocsCount))
    {
        test.Insert(row);
    }

    foreach (var row in negativeDocuments.Take(takeNegativeDocsCount))
    {
        test.Insert(row);
    }

    foreach (var row in positiveDocuments.Skip(takePositiveDocsCount))
    {
        row.Score = 100;
        train.Insert(row);
    }

    foreach (var row in negativeDocuments.Skip(takeNegativeDocsCount))
    {
        row.Score = -100;
        train.Insert(row);
    }

    _logger.Log("Excluded documents count: " + excludedCount);
    _logger.Log("TEST positive documents count: " + takePositiveDocsCount);
    _logger.Log("TEST negative documents count: " + takeNegativeDocsCount);
    _logger.Log("TRAIN positive documents count: " + (positiveDocuments.Count - takePositiveDocsCount));
    _logger.Log("TRAIN negative documents count: " + (negativeDocuments.Count - takeNegativeDocsCount));

    AssertDocumentsCount(test, train);

    return totalCount;
}