/// <summary>
/// Runs the Apriori associater over the "alive" subset of the UHS patient
/// story table without support/confidence filtering, timing each phase.
/// </summary>
public static void TestAprioriOnAliveSetNoFilter()
{
    Helpers.Utils.Debug("Testing apriori on alive dataset..");

    Stopwatch watch = Stopwatch.StartNew();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string connection = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL db = new CekaMySQL(connection);

    ArffInstance alive = db.tableToInstance(
        "uhs_patient_story".ToUpper(),
        new string[] { "INITIAL_PRESENTATION", "INT_1_PRES", "INT_2_PRES", "STATUS" },
        -1, -1, false, true);

    long loadMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug("Gathered MySQL ArffInstance in " + loadMs + "ms.");

    // Reduce to the "alive" subset via the STATUS attribute
    // (presumably drops the rows whose STATUS is "Y" — semantics of
    // removeDatasetsPerAttributeValue are not visible here; confirm).
    alive.removeDatasetsPerAttributeValue("STATUS", "Y");
    alive.Relation = "uhs_arff_alive";
    Helpers.Utils.Debug("Cleaned ArffInstance in " + (watch.ElapsedMilliseconds - loadMs) + "ms.");

    // The constructor runs the entire algorithm and writes the JSON result file.
    new Ceka.Algorithms.Associaters.Apriori(alive, 0.2f, 0.1f, true, true, "apriori_result_alive", true);

    Helpers.Utils.Debug("Apriori test done, took " + watch.ElapsedMilliseconds + "ms.");
    watch.Stop();
}
/// <summary>
/// Loads an ARFF file, converts the complex instance to a SimpleArffInstance
/// and reports the memory footprint of both representations.
/// </summary>
/// <param name="file">ARFF file name (without extension) to load; defaults to "test".</param>
public static void CompareSizeOfLoadedInstance(string file = "test")
{
    Helpers.Utils.Debug("Running Memory Size Comparing-Test..");

    Stopwatch watch = Stopwatch.StartNew();

    Loader.ArffLoader loader = new Loader.ArffLoader(file);
    loader.loadArff();
    ArffInstance complexInstance = loader.getInstance();

    long parseMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Read & Parsed ARFF file in {0} ms.", parseMs));

    SimpleArffInstance simpleInstance = new SimpleArffInstance(complexInstance);
    long convertMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Converting complex instance in {0} ms.", (convertMs - parseMs)));

    Helpers.Utils.Debug(string.Format("Simple Instance Size: {0} Kb.", simpleInstance.GetMemorySize()));
    Helpers.Utils.Debug(string.Format("Complex Instance Size: {0} Kb.", complexInstance.GetMemorySize()));
    Helpers.Utils.Debug(string.Format("Finished, took {0} ms.", watch.ElapsedMilliseconds));
}
/// <summary>
/// Builds and saves the Evol4 classifier ARFF instance from the extended UHS
/// stories table: validated load, pattern-based row removal, AGE bucketing.
/// </summary>
public static void Evol4_BuildClassifierInstance()
{
    Helpers.Utils.Debug("Evol4 Classifier..");
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string mysqlConStr = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL sql = new CekaMySQL(mysqlConStr);

    // Validated load: the MNTHS_TO_* columns are turned into numeric attributes.
    ArffInstance uhsAi = sql.tableToValidatedInstance("UHS_EXTENDED_STORIES_2",
        new string[] { "MNTHS_TO_1", "MNTHS_TO_2", "AGE", "STATUS", "T_NONE", "T_HORMONE", "T_SURGERY", "T_PRI_CHEMO", "T_ADJ_CHEMO", "T_ADJ_RT", "T_OOPH", "T_PLASTIC", "T_HER" },
        new string[] { "MNTHS_TO_1", "MNTHS_TO_2" },
        -1, -1, false, false);

    long em = sw.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Gathered MySQL ArffInstance + preparing.. in {0} ms.", em));

    // Presumably removes rows matching ["*", "0"] (wildcard first attribute,
    // literal "0" second) — confirm against deletePatternMatchingDatasets.
    uhsAi.deletePatternMatchingDatasets(new List<string>() { "*", "0" });

    // Bucket the AGE attribute into ranges and refine them back
    // (exact range semantics live in ArffInstance; not visible here).
    int ageIndex = uhsAi.getIndexOfAttribute("AGE");
    uhsAi.rebuildAttributeValueByRange(ageIndex, 29);
    uhsAi.refineBackRangedAttribute(ageIndex, 3, 5);

    uhsAi.Relation = "uhs_ext_story_classifier_evol4";

    long em2 = sw.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Prepared classifier set in {0} ms.", (em2 - em)));

    new ArffSaver(uhsAi).saveInstance("uhs_ext_story_classifier_evol4");

    // fixed log-message typo: "tooks" -> "took"
    Helpers.Utils.Debug(string.Format("Evol done, took {0} ms.", sw.ElapsedMilliseconds));
}
/// <summary>
/// Builds and saves the Evol3 Apriori ARFF instance from the extended UHS
/// stories table: range-bucketing, pattern-based row removal, cleanup.
/// </summary>
public static void Evol3_BuildAprioriInstance()
{
    Helpers.Utils.Debug("Evol3 Apriori..");
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string mysqlConStr = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL sql = new CekaMySQL(mysqlConStr);

    ArffInstance uhsAi = sql.tableToInstance("UHS_EXTENDED_STORIES_2",
        new string[] { "MNTHS_TO_1", "MNTHS_TO_2", "AGE", "STATUS", "T_NONE", "T_HORMONE", "T_SURGERY", "T_PRI_CHEMO", "T_ADJ_CHEMO", "T_ADJ_RT", "T_OOPH", "T_PLASTIC", "T_HER" },
        -1, -1, false, false);

    long em = sw.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Gathered MySQL ArffInstance {0} ms.", em));

    uhsAi.integrityCheck();

    // Bucket the first three attributes (MNTHS_TO_1, MNTHS_TO_2, AGE) into ranges.
    uhsAi.rebuildAttributeValueByRange(0, 120);
    uhsAi.rebuildAttributeValueByRange(1, 120);
    uhsAi.rebuildAttributeValueByRange(2, 16);

    // Presumably drops rows matching these range-label patterns — confirm
    // against deletePatternMatchingDatasets.
    uhsAi.deletePatternMatchingDatasets(new List<string> { "[-19<->101]" });
    uhsAi.deletePatternMatchingDatasets(new List<string> { "*", "[-67<->53]" });

    uhsAi.removeUnusedAttributeValues();
    uhsAi.Relation = "uhs_ext_story_apriori_evol3";

    long em2 = sw.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Prepared apriori set in {0} ms.", (em2 - em)));

    new ArffSaver(uhsAi).saveInstance("uhs_ext_story_apriori_evol3");

    // fixed log-message typo: "tooks" -> "took"
    Helpers.Utils.Debug(string.Format("Evol done, took {0} ms.", sw.ElapsedMilliseconds));
}
/// <summary>
/// Loads the UHS patient story table, strips the dead records and several
/// uninformative dataset patterns, then saves the cleaned "alive" ARFF file.
/// </summary>
public static void GetDefaultDumpOfAliveDeepClean()
{
    Helpers.Utils.Debug("Running Apriori on deep cleaned UHS alive dataset from DB..");

    Stopwatch watch = Stopwatch.StartNew();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string connection = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL db = new CekaMySQL(connection);

    ArffInstance uhsAi = db.tableToInstance(
        "uhs_patient_story".ToUpper(),
        new string[] { "INITIAL_PRESENTATION", "INT_1_PRES", "INT_2_PRES", "STATUS" },
        -1, -1, false, true);

    long loadMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Gathered MySQL ArffInstance in {0} ms.", loadMs));

    uhsAi.removeDatasetsPerAttributeValue("STATUS", "N");

    // Dataset patterns considered uninformative for this dump; each list is one
    // pattern (one entry per attribute, "*" is presumably a wildcard — confirm).
    List<List<string>> patterns = new List<List<string>>()
    {
        new List<string>() { "Primary|breast|cancer|(and/or|DCIS)", "Primary|breast|cancer|(and/or|DCIS)", "*" },
        new List<string>() { "Primary|breast|cancer|(and/or|DCIS)", "UNDEFINED", "*" },
        new List<string>() { "Primary|breast|cancer|(and/or|DCIS)", "*", "UNDEFINED" },
        new List<string>() { "*", "UNDEFINED", "UNDEFINED" }
    };
    foreach (List<string> pattern in patterns)
    {
        uhsAi.deletePatternMatchingDatasets(pattern);
    }

    uhsAi.Datasets.removeEmptyValueDatasets();

    long cleanMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Cleansed dataset in {0} ms.", cleanMs));

    new ArffSaver(uhsAi).saveInstance("uhs_clean_alive");

    Helpers.Utils.Debug(string.Format("Finished, took {0} ms.", watch.ElapsedMilliseconds));
    watch.Stop();
}
/// <summary>
/// Dumps the full UHS patient story table plus its "alive" and "dead" splits
/// (separated on the STATUS attribute) into three ARFF files.
/// </summary>
public static void GetDefaultDumpOfAliveAndDead()
{
    Helpers.Utils.Debug("Dumping default alive and dead ARFF files..");

    Stopwatch watch = Stopwatch.StartNew();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string connection = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL db = new CekaMySQL(connection);

    ArffInstance total = db.tableToInstance(
        "uhs_patient_story".ToUpper(),
        new string[] { "INITIAL_PRESENTATION", "INT_1_PRES", "INT_2_PRES", "STATUS" },
        -1, -1, false, true);

    long tLoad = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Gathered MySQL ArffInstance and stored total file in {0} ms.", tLoad));
    new ArffSaver(total).saveInstance("uhs_arff_total");

    ArffInstance second = total.toCopy();
    long tCopy = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Copied ArffInstance in {0} ms.", (tCopy - tLoad)));

    // One instance per STATUS value; the deep copy keeps the other half.
    total.removeDatasetsPerAttributeValue("STATUS", "Y");
    total.Relation = "uhs_arff_alive";
    long tAlive = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Separated Alive Set in {0} ms.", (tAlive - tCopy)));

    second.removeDatasetsPerAttributeValue("STATUS", "N");
    second.Relation = "uhs_arff_dead";
    long tDead = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Separated Dead Set in {0} ms.", (tDead - tAlive)));

    new ArffSaver(total).saveInstance("uhs_arff_alive");
    new ArffSaver(second).saveInstance("uhs_arff_dead");
    long tStore = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Stored both sets in files, took {0} ms.", (tStore - tDead)));

    db.close();
    Helpers.Utils.Debug(string.Format("Dumping done, took {0} ms.", watch.ElapsedMilliseconds));
}
/// <summary>
/// Runs three Apriori passes — the full table plus its "alive" and "dead"
/// STATUS splits — concurrently, writing one JSON result file per pass.
/// </summary>
public static void GetDefaultMultithreadedAprioriFlexJsonResultOfAliveAndDead()
{
    Helpers.Utils.Debug("Getting threaded Apriori Flex Results of Alive and Dead datasets..");

    Stopwatch watch = Stopwatch.StartNew();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string connection = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL db = new CekaMySQL(connection);

    ArffInstance total = db.tableToInstance(
        "uhs_patient_story".ToUpper(),
        new string[] { "INITIAL_PRESENTATION", "INT_1_PRES", "INT_2_PRES", "STATUS" },
        -1, -1, false, true);
    Helpers.Utils.Debug("Gathered MySQL ArffInstance in " + watch.ElapsedMilliseconds + "ms.");
    total.Relation = "uhs_arff_total";

    // Each worker operates on its own deep copy of the loaded instance.
    // NOTE(review): the workers copy `total` while the main-thread pass reads
    // it; this is only safe if toCopy()/Apriori do not mutate the source — confirm.
    Thread aliveWorker = new Thread(new ThreadStart(delegate()
    {
        ArffInstance alive = total.toCopy();
        alive.removeDatasetsPerAttributeValue("STATUS", "Y");
        alive.Relation = "uhs_arff_alive";
        new Ceka.Algorithms.Associaters.Apriori(alive, 0.2f, 0.1f, "apriori_result_alive");
    }));
    Thread deadWorker = new Thread(new ThreadStart(delegate()
    {
        ArffInstance dead = total.toCopy();
        dead.removeDatasetsPerAttributeValue("STATUS", "N");
        dead.Relation = "uhs_arff_dead";
        new Ceka.Algorithms.Associaters.Apriori(dead, 0.2f, 0.1f, "apriori_result_dead");
    }));

    aliveWorker.Start();
    deadWorker.Start();

    // The "total" pass runs on the current thread while the workers execute.
    new Ceka.Algorithms.Associaters.Apriori(total, 0.2f, 0.1f, "apriori_result_total");

    aliveWorker.Join();
    deadWorker.Join();

    Helpers.Utils.Debug("Json Apriori Flex Result Dump done, took " + watch.ElapsedMilliseconds + "ms.");
    watch.Stop();
}
/// <summary>
/// Same as tableToInstance(), but additionally turns the given columns into
/// numeric attributes, cleans the headers and runs an integrity check.
/// </summary>
/// <param name="table">the mysql database table</param>
/// <param name="table_col">column names to read into the arff instance</param>
/// <param name="numeric">columns to convert to numeric attributes; may be null</param>
/// <param name="startIndex">table row start index; -1 for no limit</param>
/// <param name="endIndex">table row end index; -1 for no limit</param>
/// <param name="firstUndefined">can the first listed column be NULL</param>
/// <param name="secondUndefined">can the second listed column be NULL</param>
/// <returns>the validated arff instance</returns>
public ArffInstance tableToValidatedInstance(string table, string[] table_col, string[] numeric, int startIndex = -1, int endIndex = -1, bool firstUndefined = false, bool secondUndefined = false)
{
    ArffInstance instance = this.tableToInstance(table, table_col, startIndex, endIndex, firstUndefined, secondUndefined);

    if (numeric != null)
    {
        foreach (string attributeName in numeric)
        {
            instance.turnAttributeIntoNumeric(attributeName, "numeric");
        }
    }

    // Post-load validation pass.
    instance.Headers.CleanUp();
    instance.removeUnusedAttributeValues();
    instance.integrityCheck();

    return instance;
}
/// <summary>
/// Loads an ARFF file, removes empty-value datasets and runs Apriori with
/// WEKA-formatted result output.
/// </summary>
/// <param name="file">ARFF file name (without extension); defaults to "test".</param>
public static void LoadArffFileAndRunAprioriWithWekaOutput(string file = "test")
{
    Helpers.Utils.Debug(string.Format("Running apriori on ARFF file, with WEKA output, {0}.arff..", file));

    Stopwatch watch = Stopwatch.StartNew();

    Loader.ArffLoader loader = new Loader.ArffLoader(file);
    loader.loadArff();
    ArffInstance instance = loader.getInstance();
    instance.Datasets.removeEmptyValueDatasets();

    long parseMs = watch.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Read & Parsed ARFF file in {0} ms.", parseMs));

    // Constructor runs the full algorithm and saves in WEKA format.
    new Ceka.Algorithms.Associaters.Apriori(instance, 0.1f, 0.5f, true, true, AprioriSaveTypes.WEKA);

    Helpers.Utils.Debug(string.Format("Finished, took {0} ms.", watch.ElapsedMilliseconds));
    watch.Stop();
}
/// <summary>
/// Dumps the extended UHS stories table (complex set) into a single ARFF file.
/// </summary>
public static void GetDefaultDumpOfComplexUHSSets()
{
    Helpers.Utils.Debug("Dumping default (complex) Arff UHS files..");
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    string mysqlConStr = "SERVER=localhost;" + "DATABASE=uhs;" + "UID=root;" + "PASSWORD=pascal;";
    CekaMySQL sql = new CekaMySQL(mysqlConStr);

    ArffInstance uhsAi = sql.tableToInstance("UHS_EXTENDED_STORIES_2",
        new string[] { "AGE", "STATUS", "T_NONE", "T_HORMONE", "T_SURGERY", "T_PRI_CHEMO", "T_ADJ_CHEMO", "T_ADJ_RT", "T_OOPH", "T_PLASTIC", "T_HER" },
        -1, -1, true, true);

    long em = sw.ElapsedMilliseconds;
    Helpers.Utils.Debug(string.Format("Gathered MySQL ArffInstance and stored total file in {0} ms.", em));

    new ArffSaver(uhsAi).saveInstance("uhs_ext_stories_evol1");

    // fixed log-message typo: "tooks" -> "took"
    Helpers.Utils.Debug(string.Format("Dumping done, took {0} ms.", sw.ElapsedMilliseconds));
}
/// <summary>
/// Single constructor; initializes the algorithm state from the source
/// instance using default thresholds.
/// </summary>
/// <param name="ai">the ARFF instance the algorithm will run on</param>
public Apriori(ArffInstance ai)
    : base()
{
    // Interlocked keeps the shared instance counter correct when Apriori
    // objects are created from several threads at once (the threaded test
    // helpers construct three instances in parallel). The previous
    // read-then-increment was a race. Increment returns the new value, so
    // subtracting 1 yields the same id the original code assigned.
    this.algorithm_id = System.Threading.Interlocked.Increment(ref Apriori.APRIORI_COUNT) - 1;

    source = ai;

    // NOTE(review): assumes the source holds at least one dataset —
    // Data[0] throws on an empty instance; confirm callers guarantee this.
    dataset_count = this.source.Datasets.Data.Count;
    dataset_attribute_count = this.source.Datasets.Data[0].Length();

    hash = new Common.MurmurHash2Unsafe();
    this.rep_n_list = new List<uint[][]>();
}
/// <summary>
/// Traditional apriori constructor; configures the run through the minimum
/// support and confidence thresholds.
/// </summary>
/// <param name="ai">the ARFF instance the algorithm will run on</param>
/// <param name="support">minimum support threshold</param>
/// <param name="confidence">minimum confidence threshold</param>
public Apriori(ArffInstance ai, float support, float confidence)
    : this(ai)
{
    this.threshold_min_support = support;
    this.threshold_confidence = confidence;
}
/// <summary>
/// Traditional apriori constructor that runs the algorithm immediately and
/// writes the result straight to a pretty-printed json file.
/// </summary>
/// <param name="ai">the ARFF instance the algorithm will run on</param>
/// <param name="support">minimum support threshold</param>
/// <param name="confidence">minimum confidence threshold</param>
/// <param name="file">output file name without the ".json" extension</param>
public Apriori(ArffInstance ai, float support, float confidence, string file)
    : this(ai, support, confidence)
{
    this.run_default_process();

    Ceka.Saver.SimpleJsonSaver saver = new Ceka.Saver.SimpleJsonSaver(this.get_aobj_result(), true);
    saver.SaveToFile(file + ".json");
}
/// <summary>
/// Traditional apriori constructor that writes the json result to a file but
/// leaves the possibility to disable the support/confidence filters.
/// </summary>
/// <param name="ai">the ARFF instance the algorithm will run on</param>
/// <param name="support">minimum support threshold</param>
/// <param name="confidence">minimum confidence threshold</param>
/// <param name="filterSupport">apply the support filter</param>
/// <param name="filterConfidence">apply the confidence filter</param>
/// <param name="file">output file name without the ".json" extension</param>
/// <param name="prettyJson">pretty-print the json output</param>
public Apriori(ArffInstance ai, float support, float confidence, bool filterSupport, bool filterConfidence, string file, bool prettyJson)
    : this(ai, support, confidence)
{
    this.filterForSupport = filterSupport;
    this.filterForConfidence = filterConfidence;

    this.run_default_process(filterSupport, filterConfidence);

    Ceka.Saver.SimpleJsonSaver saver = new Ceka.Saver.SimpleJsonSaver(this.get_aobj_result(), prettyJson);
    saver.SaveToFile(file + ".json");
}
/// <summary>
/// Similar to the traditional constructor with filter options, but leaves
/// options for the result format (JSON, pretty JSON, WEKA, none) and for
/// printing to the CLI instead of a file.
/// </summary>
/// <param name="ai">the ARFF instance the algorithm will run on</param>
/// <param name="support">minimum support threshold</param>
/// <param name="confidence">minimum confidence threshold</param>
/// <param name="filterSupport">apply the support filter</param>
/// <param name="filterConfidence">apply the confidence filter</param>
/// <param name="savt">how (and whether) to save the result</param>
/// <param name="cli">print to the CLI instead of writing a file</param>
/// <param name="outputFile">optional output file name; defaults to "&lt;relation&gt;_result"</param>
public Apriori(ArffInstance ai, float support, float confidence, bool filterSupport, bool filterConfidence, Saver.AprioriSaveTypes savt, bool cli = false, string outputFile = null)
    : this(ai, support, confidence)
{
    this.filterForSupport = filterSupport;
    this.filterForConfidence = filterConfidence;

    this.run_default_process(filterSupport, filterConfidence);

    // File name defaults to the source relation unless explicitly overridden.
    string file = (outputFile != null) ? outputFile : (this.source.Relation + "_result");

    switch (savt)
    {
        case Saver.AprioriSaveTypes.JSON:
        case Saver.AprioriSaveTypes.JSON_PRETTY:
        {
            bool pretty = (savt == Saver.AprioriSaveTypes.JSON_PRETTY);
            Saver.SimpleJsonSaver jsonSaver = new Saver.SimpleJsonSaver(this.get_aobj_result(), pretty);
            if (cli)
                jsonSaver.CLI();
            else
                jsonSaver.SaveToFile(file + ".json");
            break;
        }
        case Saver.AprioriSaveTypes.WEKA:
        {
            Saver.WekaAssociationRulesSaver wekaSaver = new Saver.WekaAssociationRulesSaver(this.get_aobj_result());
            if (cli)
                wekaSaver.CLI();
            else
                wekaSaver.SaveToFile(file + ".ceka");
            break;
        }
        case Saver.AprioriSaveTypes.NONE:
            log("Apriori finished, doing nothing as SaveType is NONE.");
            break;
        default:
            log("Apriori finished, but SaveType is DEFAULT: " + savt.ToString());
            break;
    }
}
/// <summary>
/// Turns the table structure into attributes and the table rows (content)
/// into data rows of a new ArffInstance.
/// </summary>
/// <param name="table">the mysql database table</param>
/// <param name="table_col">a list of column names that are to be read into the arff instance</param>
/// <param name="startIndex">table row start index; use -1 for no limit</param>
/// <param name="endIndex">table row end index; use -1 for no limit</param>
/// <param name="firstUndefined">can the first column of the passed column array be NULL</param>
/// <param name="secondUndefined">can the second column of the passed column array be NULL</param>
/// <returns>returns the generated arff instance</returns>
/// <exception cref="CekaException">thrown when fewer than two columns are requested</exception>
public ArffInstance tableToInstance(string table, string[] table_col, int startIndex = -1, int endIndex = -1, bool firstUndefined = false, bool secondUndefined = false)
{
    if (table_col == null || table_col.Length < 2)
        throw new CekaException("can not create an ArffInstance from a Table with less then 2 columns.");

    // Build "SELECT c1, c2, ... FROM table [WHERE c1 IS NOT NULL [AND c2 IS NOT NULL]]".
    // NOTE(review): table/column names are concatenated into the SQL text; they
    // must come from trusted code, never from user input (cannot be parameterized).
    StringBuilder sb = new StringBuilder();
    sb.Append("SELECT ");
    for (int i = 0; i < table_col.Length; i++)
    {
        sb.Append(table_col[i]);
        if (i != (table_col.Length - 1))
            sb.Append(", ");
    }
    sb.Append(" FROM ");
    sb.Append(table);
    if (!firstUndefined)
    {
        sb.Append(" WHERE ");
        sb.Append(table_col[0]);
        if (!secondUndefined)
        {
            sb.Append(" IS NOT NULL AND ");
            sb.Append(table_col[1]);
            sb.Append(" IS NOT NULL");
        }
        else
        {
            sb.Append(" IS NOT NULL");
        }
    }

    MySqlDataReader mr = query(sb.ToString());
    try
    {
        ArffInstance ai = new ArffInstance("ceka_" + table);
        foreach (string s in table_col) //setup attributes from table columns
            ai.addAttribute(s, this.getDistinctOccurencesInColumn(table, s));

        //gather data from select resultset into a library dataset
        int c = 0;
        string[] sa = new string[table_col.Length];
        while (mr.Read())
        {
            if ((startIndex == -1 || c >= startIndex) && (endIndex == -1 || c <= endIndex)) //make sure to run in limes
            {
                for (int i = 0; i < mr.FieldCount; i++)
                {
                    sa[i] = mr.GetValue(i).ToString();
                    // IsNullOrWhiteSpace subsumes IsNullOrEmpty; the double check was redundant.
                    if (string.IsNullOrWhiteSpace(sa[i]))
                        sa[i] = ArffFile.ATT_UNDEFINED; //make sure to give these a value at all times
                    else if (sa[i].Contains(ArffFile.ARFF_SPACE)) //also make sure there are no whitespaces at all times
                    {
                        sa[i] = sa[i].Replace(ArffFile.ARFF_SPACE, ArffFile.ATT_SPACE_EXCHANGE);
                        if (string.IsNullOrWhiteSpace(sa[i]))
                            sa[i] = ArffFile.ATT_UNDEFINED;
                    }
                }
                ai.addDataset(sa);
                sa = new string[table_col.Length];
            }
            c++;
        }
        return ai;
    }
    finally
    {
        // Always release the reader, even when an exception occurs mid-read
        // (the original leaked the reader on error).
        mr.Close();
    }
}
/// <summary>
/// Turns the table structure into attributes and the table rows (content)
/// into data rows of a new ArffInstance.
/// </summary>
/// <param name="table">the mysql database table</param>
/// <param name="table_col">a list of column names that are to be read into the arff instance</param>
/// <param name="startIndex">table row start index; use -1 for no limit</param>
/// <param name="endIndex">table row end index; use -1 for no limit</param>
/// <param name="firstUndefined">can the first column of the passed column array be NULL</param>
/// <param name="secondUndefined">can the second column of the passed column array be NULL</param>
/// <returns>returns the generated arff instance</returns>
/// <exception cref="CekaException">thrown when fewer than two columns are requested</exception>
public ArffInstance tableToInstance(string table, string[] table_col, int startIndex = -1, int endIndex = -1, bool firstUndefined = false, bool secondUndefined = false)
{
    if (table_col == null || table_col.Length < 2)
    {
        throw new CekaException("can not create an ArffInstance from a Table with less then 2 columns.");
    }

    // Build "SELECT c1, c2, ... FROM table [WHERE c1 IS NOT NULL [AND c2 IS NOT NULL]]".
    // NOTE(review): table/column names are concatenated into the SQL text; they
    // must come from trusted code, never from user input (cannot be parameterized).
    StringBuilder sb = new StringBuilder();
    sb.Append("SELECT ");
    for (int i = 0; i < table_col.Length; i++)
    {
        sb.Append(table_col[i]);
        if (i != (table_col.Length - 1))
        {
            sb.Append(", ");
        }
    }
    sb.Append(" FROM ");
    sb.Append(table);
    if (!firstUndefined)
    {
        sb.Append(" WHERE ");
        sb.Append(table_col[0]);
        if (!secondUndefined)
        {
            sb.Append(" IS NOT NULL AND ");
            sb.Append(table_col[1]);
            sb.Append(" IS NOT NULL");
        }
        else
        {
            sb.Append(" IS NOT NULL");
        }
    }

    MySqlDataReader mr = query(sb.ToString());
    try
    {
        ArffInstance ai = new ArffInstance("ceka_" + table);
        foreach (string s in table_col) //setup attributes from table columns
        {
            ai.addAttribute(s, this.getDistinctOccurencesInColumn(table, s));
        }

        //gather data from select resultset into a library dataset
        int c = 0;
        string[] sa = new string[table_col.Length];
        while (mr.Read())
        {
            if ((startIndex == -1 || c >= startIndex) && (endIndex == -1 || c <= endIndex)) //make sure to run in limes
            {
                for (int i = 0; i < mr.FieldCount; i++)
                {
                    sa[i] = mr.GetValue(i).ToString();
                    // IsNullOrWhiteSpace subsumes IsNullOrEmpty; the double check was redundant.
                    if (string.IsNullOrWhiteSpace(sa[i]))
                    {
                        sa[i] = ArffFile.ATT_UNDEFINED; //make sure to give these a value at all times
                    }
                    else if (sa[i].Contains(ArffFile.ARFF_SPACE)) //also make sure there are no whitespaces at all times
                    {
                        sa[i] = sa[i].Replace(ArffFile.ARFF_SPACE, ArffFile.ATT_SPACE_EXCHANGE);
                        if (string.IsNullOrWhiteSpace(sa[i]))
                        {
                            sa[i] = ArffFile.ATT_UNDEFINED;
                        }
                    }
                }
                ai.addDataset(sa);
                sa = new string[table_col.Length];
            }
            c++;
        }
        return (ai);
    }
    finally
    {
        // Always release the reader, even when an exception occurs mid-read
        // (the original leaked the reader on error).
        mr.Close();
    }
}
/// <summary>
/// Main constructor; keeps a reference to the instance for later saving.
/// </summary>
/// <param name="instance">the arff instance you want to write to a file</param>
public ArffSaver(ArffInstance instance)
{
    Instance = instance;
}