Beispiel #1
0
		/// <summary>
		/// Reads every row of the DOCUMENTS table from <paramref name="input"/> and routes it by category:
		/// rows without a category are inserted into <paramref name="test"/> with a score of 0; categorized
		/// rows are inserted into <paramref name="train"/> with a score of 100 (category listed in
		/// <c>parameters.positiveCategories</c>) or -100 (any other category). Rows whose category is listed
		/// in <c>parameters.excludedCategories</c> are counted but not inserted anywhere.
		/// </summary>
		/// <param name="input">Connection the SELECT over DOCUMENTS is executed on.</param>
		/// <param name="test">Connection that receives the uncategorized rows.</param>
		/// <param name="train">Connection that receives the categorized, non-excluded rows.</param>
		/// <param name="parameters">Supplies the positive and excluded category ids.</param>
		/// <param name="noteColumnName">
		/// Name of the column read as the note text. It is concatenated into the SQL text as-is,
		/// so it must be a trusted identifier.
		/// </param>
		/// <returns>Total number of rows read, including excluded ones.</returns>
		protected int Convert_ScoreUncategorized(OleDbConnection input, SQLiteConnection test, SQLiteConnection train, Parameters parameters, string noteColumnName)
		{
			var totalCount = 0;

			var testCount = 0;
			var trainPositiveCount = 0;
			var trainNegativeCount = 0;
			var excludedCount = 0;

			var cmdText = "SELECT ED_ENC_NUM, " + noteColumnName + ", Category FROM DOCUMENTS";

			///////////////////////////////////////////////////////////////////////////////

			var positiveCategories = new HashSet<int>(parameters.positiveCategories);

			// A missing exclusion list means "exclude nothing" rather than a crash in the HashSet constructor.
			var excludedCategories = parameters.excludedCategories != null
				? new HashSet<int>(parameters.excludedCategories)
				: new HashSet<int>();

			// Both the command and the reader are disposed when the block exits.
			using (var command = new OleDbCommand(cmdText, input))
			using (var reader = command.ExecuteReader())
			{
				if (reader.HasRows)
				{
					var categoryColumnType = GetCategoryColumnType(reader);

					while (reader.Read())
					{
						var row = new Documents
								  {
									  ED_ENC_NUM = reader.GetDouble(0),
									  // GetString throws on DBNull; store an empty note instead,
									  // the same way Convert_Divide does.
									  NOTE_TEXT = reader.IsDBNull(1) ? "" : reader.GetString(1),
									  Category = GetInt32ValueInvariant(reader, 2, categoryColumnType)
								  };

						///////////////////////////////////////////////////////////////////////////////

						if (row.Category == null)
						{
							// Uncategorized: this is a document to be scored, so it goes to the test set.
							row.Score = 0;
							test.Insert(row);

							testCount++;
						}
						else if (excludedCategories.Contains(row.Category.Value))
						{
							excludedCount++;
						}
						else
						{
							if (positiveCategories.Contains(row.Category.Value))
							{
								row.Score = 100;

								trainPositiveCount++;
							}
							else
							{
								row.Score = -100;

								trainNegativeCount++;
							}

							train.Insert(row);
						}

						///////////////////////////////////////////////////////////////////////////////

						totalCount++;
					}
				}
			}

			///////////////////////////////////////////////////////////////////////////////

			_logger.Log("Excluded documents count: " + excludedCount);

			_logger.Log("TEST documents count: " + testCount);

			_logger.Log("TRAIN positive documents count: " + trainPositiveCount);
			_logger.Log("TRAIN negative documents count: " + trainNegativeCount);

			///////////////////////////////////////////////////////////////////////////////

			AssertDocumentsCount(test, train);

			///////////////////////////////////////////////////////////////////////////////

			return totalCount;
		}
Beispiel #2
0
        /// <summary>
        /// Translates the textual values of a dynamic column into numeric category ids.
        /// Loads the (ID, Title) pairs of <c>DynamicColumnCategories</c> for
        /// <c>parameters.dynamicColumnID</c>, resets <c>Documents.Category</c> to NULL, then sets
        /// <c>Category</c> to the category id on every document whose
        /// <c>parameters.dynamicColumnTitle</c> column equals the category title.
        /// On completion <c>parameters.positiveCategories</c> and <c>parameters.excludedCategories</c>
        /// hold the ids whose titles appear in <c>parameters.dynamicPositiveCategories</c> and
        /// <c>parameters.dynamicExcludedCategories</c> respectively.
        /// </summary>
        /// <param name="inputFilePath">Path of the database file passed to the connection-string helper.</param>
        /// <param name="parameters">
        /// Conversion settings; mutated by this method (null dynamic category lists are replaced with
        /// empty arrays, and the positive/excluded id arrays are overwritten).
        /// </param>
        public void Convert(string inputFilePath, Parameters parameters)
        {
            if (parameters.dynamicPositiveCategories == null)
            {
                parameters.dynamicPositiveCategories = new string[] { };
            }

            if (parameters.dynamicExcludedCategories == null)
            {
                parameters.dynamicExcludedCategories = new string[] { };
            }

            // NOTE(review): the flag reported by the helper is not inspected here; an invalid
            // password presumably surfaces as an exception from Open() - confirm.
            bool invalidPassword;

            using (var connection = new OleDbConnection(ConnectionStringHelper.GetConnectionString(inputFilePath, parameters.password, out invalidPassword)))
            {
                connection.Open();

                ///////////////////////////////////////////////////////////////////////////////
                // 1. Load the categories defined for the dynamic column.

                var categoriesTable = new DataTable();

                using (var selectCommand = connection.CreateCommand())
                {
                    selectCommand.CommandText = "SELECT ID, Title FROM DynamicColumnCategories WHERE DynamicColumnID = @ID";
                    selectCommand.Parameters.AddWithValue("@ID", parameters.dynamicColumnID);

                    using (var adapter = new OleDbDataAdapter(selectCommand))
                    {
                        adapter.Fill(categoriesTable);
                    }
                }

                ///////////////////////////////////////////////////////////////////////////////
                // 2. Reset the Category column so stale values from a previous run do not survive.

                using (var clearCommand = connection.CreateCommand())
                {
                    clearCommand.CommandText = "UPDATE Documents SET Category = NULL";
                    clearCommand.ExecuteNonQuery();
                }

                ///////////////////////////////////////////////////////////////////////////////
                // 3. Stamp each document with the id of the category its dynamic column names.

                var totalRowsUpdated = 0;

                var positiveCategories = new List<int>();
                var excludedCategories = new List<int>();

                using (var updateCommand = connection.CreateCommand())
                {
                    // The column title is concatenated into the SQL text because an identifier
                    // cannot be passed as a parameter; it must therefore come from a trusted source.
                    updateCommand.CommandText = "UPDATE Documents SET Category = @categoryID WHERE [" + parameters.dynamicColumnTitle + "] = @categoryTitle";

                    // OLE DB binds parameters by position, so they are added in the order
                    // the placeholders appear in the command text.
                    updateCommand.Parameters.AddWithValue("@categoryID", 0);
                    updateCommand.Parameters.AddWithValue("@categoryTitle", "");

                    foreach (DataRow row in categoriesTable.Rows)
                    {
                        var categoryID    = (int)row[0];
                        var categoryTitle = (string)row[1];

                        // A title listed as both positive and excluded is treated as positive.
                        if (parameters.dynamicPositiveCategories.Any(x => x == categoryTitle))
                        {
                            positiveCategories.Add(categoryID);
                        }
                        else if (parameters.dynamicExcludedCategories.Any(x => x == categoryTitle))
                        {
                            excludedCategories.Add(categoryID);
                        }

                        updateCommand.Parameters[0].Value = categoryID;
                        updateCommand.Parameters[1].Value = categoryTitle;

                        totalRowsUpdated += updateCommand.ExecuteNonQuery();
                    }
                }

                ///////////////////////////////////////////////////////////////////////////////

                parameters.positiveCategories = positiveCategories.ToArray();
                parameters.excludedCategories = excludedCategories.ToArray();

                _logger.Log("Categories converted for " + totalRowsUpdated + " document(s)");
            }
        }
    }
}
Beispiel #3
0
		/// <summary>
		/// Reads every row of the DOCUMENTS table from <paramref name="input"/>, shuffles the categorized,
		/// non-excluded rows into a positive and a negative list, and splits each list between
		/// <paramref name="test"/> (the first <c>parameters.validationPercentage</c> percent) and
		/// <paramref name="train"/> (the remainder, scored 100 for positive and -100 for negative).
		/// Rows whose category is excluded, and rows without any category, are counted but not inserted.
		/// </summary>
		/// <param name="input">Connection the SELECT over DOCUMENTS is executed on.</param>
		/// <param name="test">Connection that receives the validation share of the documents.</param>
		/// <param name="train">Connection that receives the scored training share of the documents.</param>
		/// <param name="parameters">Supplies the positive/excluded category ids and the validation percentage.</param>
		/// <param name="noteColumnName">
		/// Name of the column read as the note text. It is concatenated into the SQL text as-is,
		/// so it must be a trusted identifier.
		/// </param>
		/// <returns>Total number of rows read, including excluded and uncategorized ones.</returns>
		/// <exception cref="ArgumentException">No positive categories were supplied.</exception>
		protected int Convert_Divide(OleDbConnection input, SQLiteConnection test, SQLiteConnection train, Parameters parameters, string noteColumnName)
		{
			if (parameters.positiveCategories == null || parameters.positiveCategories.Length == 0)
				throw new ArgumentException("No positive categories");

			///////////////////////////////////////////////////////////////////////////////

			var totalCount = 0;
			var excludedCount = 0;
			var uncategorizedCount = 0;

			///////////////////////////////////////////////////////////////////////////////

			string cmdText = "SELECT ED_ENC_NUM, " + noteColumnName + ", Category FROM DOCUMENTS";

			///////////////////////////////////////////////////////////////////////////////

			var positiveCategories = new HashSet<int>(parameters.positiveCategories);

			// A missing exclusion list means "exclude nothing" rather than a crash in the HashSet constructor.
			var excludedCategories = parameters.excludedCategories != null
				? new HashSet<int>(parameters.excludedCategories)
				: new HashSet<int>();

			var positiveDocuments = new List<Documents>();
			var negativeDocuments = new List<Documents>();

			// Default seeding; seeding from DateTime.Now.Millisecond allowed only 1000 distinct shuffles.
			var random = new Random();

			// Both the command and the reader are disposed when the block exits.
			using (var command = new OleDbCommand(cmdText, input))
			using (var reader = command.ExecuteReader())
			{
				if (reader.HasRows)
				{
					var categoryColumnType = GetCategoryColumnType(reader);

					while (reader.Read())
					{
						var row = new Documents
								  {
									  ED_ENC_NUM = reader.GetDouble(0),
									  NOTE_TEXT = reader.IsDBNull(1) ? "" : reader.GetString(1),
									  Category = GetInt32ValueInvariant(reader, 2, categoryColumnType)
								  };

						///////////////////////////////////////////////////////////////////////////////

						if (row.Category == null)
						{
							// A row without a category has no label to train or validate against;
							// reading Category.Value here would throw InvalidOperationException.
							uncategorizedCount++;
						}
						else if (excludedCategories.Contains(row.Category.Value))
						{
							excludedCount++;
						}
						else if (positiveCategories.Contains(row.Category.Value))
						{
							InsertRandom(positiveDocuments, row, random);
						}
						else
						{
							InsertRandom(negativeDocuments, row, random);
						}

						///////////////////////////////////////////////////////////////////////////////

						totalCount++;
					}
				}
			}

			///////////////////////////////////////////////////////////////////////////////

			// Validation share of each list; the cast truncates toward zero.
			var takePositiveDocsCount = (int) ((positiveDocuments.Count / 100d) * parameters.validationPercentage);
			var takeNegativeDocsCount = (int) ((negativeDocuments.Count / 100d) * parameters.validationPercentage);

			foreach (var row in positiveDocuments.Take(takePositiveDocsCount))
			{
				test.Insert(row);
			}

			foreach (var row in negativeDocuments.Take(takeNegativeDocsCount))
			{
				test.Insert(row);
			}

			///////////////////////////////////////////////////////////////////////////////

			foreach (var row in positiveDocuments.Skip(takePositiveDocsCount))
			{
				row.Score = 100;

				train.Insert(row);
			}

			foreach (var row in negativeDocuments.Skip(takeNegativeDocsCount))
			{
				row.Score = -100;

				train.Insert(row);
			}

			///////////////////////////////////////////////////////////////////////////////

			_logger.Log("Excluded documents count: " + excludedCount);

			// Only reported when it happens, so the log is unchanged for fully categorized input.
			if (uncategorizedCount > 0)
				_logger.Log("Uncategorized documents skipped: " + uncategorizedCount);

			_logger.Log("TEST positive documents count: " + takePositiveDocsCount);
			_logger.Log("TEST negative documents count: " + takeNegativeDocsCount);

			_logger.Log("TRAIN positive documents count: " + (positiveDocuments.Count - takePositiveDocsCount));
			_logger.Log("TRAIN negative documents count: " + (negativeDocuments.Count - takeNegativeDocsCount));

			///////////////////////////////////////////////////////////////////////////////

			AssertDocumentsCount(test, train);

			///////////////////////////////////////////////////////////////////////////////

			return totalCount;
		}