Ejemplo n.º 1
0
        /// <summary>
        /// Generate the best mapping from the specified source set to a table of the specified target schema.
        /// Best mappings of the source greater tables (the outputs of the source columns) are reused if the mapper
        /// already stores them; otherwise they are generated recursively first.
        /// </summary>
        /// <param name="sourceSet">Source table to be mapped. Primitive tables are delegated to MapPrimitiveSet.</param>
        /// <param name="targetSchema">Schema whose tables are considered as candidate mapping targets.</param>
        /// <returns>
        /// The mappings generated by this call. For a non-primitive source set this is a single mapping,
        /// either to an existing target table or to a newly created one. The same mappings are appended to Mappings.
        /// </returns>
        public List<Mapping> MapSet(DcTable sourceSet, DcSchema targetSchema)
        {
            if (sourceSet.IsPrimitive)
            {
                return MapPrimitiveSet((Schema.Table)sourceSet, targetSchema);
            }

            List<Mapping> maps = new List<Mapping>();

            // Best mapping of the output (greater) table for each source column. The value is null if no mapping could be built.
            Dictionary<DcColumn, Mapping> greaterMappings = new Dictionary<DcColumn, Mapping>();

            //
            // 1. Find target greater tables. They are found among mappings and hence can contain both existing (in the schema) and new tables.
            //
            List<DcTable> targetOutputTabs = new List<DcTable>(); // One entry per source column; null where no greater mapping exists

            foreach (DcColumn sd in sourceSet.Columns)
            {
                Mapping gMapping = GetBestMapping(sd.Output, targetSchema);

                if (gMapping == null) // Either does not exist or cannot be built (for example, formally not possible or meaningless)
                {
                    MapSet(sd.Output, targetSchema);                    // Recursion up to primitive tables if not computed and stored earlier
                    gMapping = GetBestMapping(sd.Output, targetSchema); // Try again after generation
                }

                greaterMappings.Add(sd, gMapping);
                targetOutputTabs.Add(gMapping != null ? gMapping.TargetTab : null);
            }

            //
            // 2. Now find the best (existing) lesser set for the target greater tables. The best set should cover most of them by its greater columns.
            //
            List<DcTable> allTargetTabs = targetSchema.AllSubTables;

            double maxCoverage      = 0;
            int    maxCoverageIndex = -1; // Stays -1 if no candidate gets a score above 0

            for (int i = 0; i < allTargetTabs.Count; i++)
            {
                DcTable candidate = allTargetTabs[i];

                // Find coverage of this target set (how many best greater target tables it covers)
                double coverage = 0;
                foreach (DcColumn tgc in candidate.Columns)
                {
                    DcTable tgs = tgc.Output;

                    // The null check prevents matching the null placeholders stored for unmapped source columns
                    if (tgs == null || !targetOutputTabs.Contains(tgs))
                    {
                        continue;
                    }

                    // TODO: Compare column names and then use it as a weight [0,1] instead of simply incrementing
                    coverage += 1;
                }

                if (targetOutputTabs.Count > 0) // Avoid 0/0 (NaN) for a source set without columns
                {
                    coverage /= targetOutputTabs.Count; // Normalize to [0,1]
                }
                if (coverage > 1)
                {
                    coverage = 1; // A lesser set can use (reference, cover) a greater set more than once
                }

                // Take into account individual similarity of the target set with the source set
                double nameSimilarity = StringSimilarity.ComputeStringSimilarity(sourceSet.Name, candidate.Name, 3);
                coverage *= nameSimilarity;

                // TODO: Take into account difference in max ranks

                if (coverage > maxCoverage)
                {
                    maxCoverage      = coverage;
                    maxCoverageIndex = i;
                }
            }

            //
            // 3. Create and store a mapping (or several mappings)
            //
            Mapping newMapping;

            // A new target set is also created when no candidate was selected at all (index is still -1),
            // which would otherwise lead to an invalid index if the threshold is not positive.
            if (maxCoverageIndex < 0 || maxCoverage < SetCreationThreshold)
            {
                // Create new target set for mapping (and its greater columns) which will be accessible only via the mapping object (not via the schema)
                DcTable ts = new Schema.Table(sourceSet.Name, sourceSet.Space); // New set has the same name as the source set

                newMapping = new Mapping(sourceSet, ts);

                foreach (DcColumn sd in sourceSet.Columns) // For each source column, create one new target column
                {
                    Mapping gMapping = greaterMappings[sd];
                    if (gMapping == null) // The output table of this column could not be mapped, so there is no target output table
                    {
                        continue;
                    }
                    DcTable gts = gMapping.TargetTab;

                    DcColumn td = targetSchema.Space.CreateColumn(sd.Name, ts, gts, sd.IsKey); // Create a clone for the source column

                    newMapping.AddPaths(sd, td, gMapping); // Add a pair of columns as a match (with expansion using the specified greater mapping)
                }

                newMapping.Similarity = 1.0;
            }
            else // Use existing target set for the mapping
            {
                DcTable ts = allTargetTabs[maxCoverageIndex];

                newMapping = new Mapping(sourceSet, ts);

                foreach (DcColumn sd in sourceSet.Columns) // For each source column, find best target column
                {
                    Mapping gMapping = greaterMappings[sd];
                    if (gMapping == null) // The output table of this column could not be mapped, so the column stays uncovered
                    {
                        continue;
                    }
                    DcTable gts = gMapping.TargetTab;

                    // Find an existing column from ts to gts.
                    // TODO: In fact, we need to choose the best column, for example, by comparing their names, usages, ranks and other semantic factors
                    DcColumn td = ts.Columns.FirstOrDefault(d => d.Output == gts);

                    if (td == null) // No good target column found (the source column is not covered)
                    {
                        continue;   // TODO: Maybe create a new target column rather than simply ignoring it
                    }

                    newMapping.AddPaths(sd, td, gMapping); // Add a pair of columns as a match (with expansion using the specified greater mapping)
                }

                newMapping.Similarity = maxCoverage;
            }

            maps.Add(newMapping);

            Mappings.AddRange(maps);
            return maps;
        }