Ejemplo n.º 1
0
        /// <summary>
        /// Re-scores every match in <c>Matches</c> and stores the arithmetic mean of those
        /// scores in <c>Similarity</c>.
        /// </summary>
        /// <returns>The mean per-match similarity, or 0.0 when there are no matches.</returns>
        public double ComputeSimilarity()
        {
            double total = 0.0;

            foreach (PathMatch match in Matches)
            {
                // Store the score on the match first, then accumulate what the match reports back.
                match.Similarity = StringSimilarity.ComputePathSimilarity(match.SourcePath, match.TargetPath);
                total = total + match.Similarity;
            }

            // An empty mapping has no evidence at all, hence similarity 0 (also avoids dividing by zero).
            Similarity = (Matches.Count > 0) ? total / Matches.Count : 0.0;

            return Similarity;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Picks, among all paths enumerated from the target set, the one most similar to the source path.
        /// </summary>
        /// <param name="sourcePath">Source path for which a counterpart is sought.</param>
        /// <param name="targetSet">Table from which the candidate target paths are enumerated.</param>
        /// <returns>
        /// The candidate with the strictly highest similarity (the earliest one wins ties),
        /// or null when no candidate scores above <c>Double.MinValue</c> - in particular when
        /// there are no candidates at all.
        /// </returns>
        public ColumnPath MapCol(ColumnPath sourcePath, DcTable targetSet)
        {
            PathEnumerator enumerator = new PathEnumerator(targetSet, ColumnType.IDENTITY_ENTITY);
            List <ColumnPath> candidates = enumerator.ToList();

            ColumnPath winner      = null;
            double     winnerScore = Double.MinValue;

            // With no candidates the loop body never runs and null is returned.
            for (int idx = 0; idx < candidates.Count; idx++)
            {
                ColumnPath candidate = candidates[idx];
                double     score     = StringSimilarity.ComputePathSimilarity(sourcePath, candidate);

                if (!(score > winnerScore))
                {
                    continue; // Not strictly better than the current winner
                }

                winnerScore = score;
                winner      = candidate;
            }

            return winner;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Build mappings from the source set to the target set. The tables are greater tables of the specified columns.
        /// The mapping should take into account (semantically) that these tables are used from these columns.
        ///
        /// Candidate paths are enumerated from the Output of each prefix path. For every source path the
        /// most similar target paths (at most MaxPossibleTargetPaths) are kept, and an iterative
        /// backtracking search then enumerates combinations of "source path -> target path or nothing".
        /// A combination is kept when its coverage reaches MinSourcePathsMatched and its
        /// coverage-weighted similarity reaches MinSetMappingQuality. At most MaxMappingsToBuild
        /// complete combinations are examined.
        /// </summary>
        /// <param name="sourcePath">Prefix path on the source side; continuations are enumerated from its Output.</param>
        /// <param name="targetPath">Prefix path on the target side; continuations are enumerated from its Output.</param>
        /// <returns>
        /// The accepted mappings ordered by descending Similarity. The same mappings are also appended to Mappings.
        /// </returns>
        public List <Mapping> MapCol(ColumnPath sourcePath, ColumnPath targetPath)
        {
            // We analyze all continuations of the specified prefix paths
            List <ColumnPath> sourcePaths = (new PathEnumerator(sourcePath.Output, ColumnType.IDENTITY_ENTITY)).ToList();

            // InsertFirst presumably prepends the prefix to each continuation - confirm against ColumnPath.
            // If there are no continuations, the prefix path itself becomes the only candidate.
            sourcePaths.ForEach(p => p.InsertFirst(sourcePath));
            if (sourcePaths.Count == 0)
            {
                sourcePaths.Add(sourcePath);
            }

            // Same preparation for the target side
            List <ColumnPath> targetPaths = (new PathEnumerator(targetPath.Output, ColumnType.IDENTITY_ENTITY)).ToList();

            targetPaths.ForEach(p => p.InsertFirst(targetPath));
            if (targetPaths.Count == 0)
            {
                targetPaths.Add(targetPath);
            }

            List <Mapping> mappings = new List <Mapping>();

            // One "column" of the search per source path; always at least 1 because of the fallback above
            int colCount = sourcePaths.Count();

            var matches = new List <Tuple <ColumnPath, List <ColumnPath> > >(); // List of: <srcPath, targetPaths>

            int[] lengths = new int[colCount];                                  // Each column has some length (some valid target paths)
            for (int i = 0; i < colCount; i++)
            {
                ColumnPath        sp  = sourcePaths[i];
                List <ColumnPath> tps = new List <ColumnPath>();

                // Sort target paths according to their similarity
                tps.AddRange(targetPaths);
                tps = tps.OrderByDescending(p => StringSimilarity.ComputePathSimilarity(sp, p)).ToList();
                if (tps.Count > MaxPossibleTargetPaths) // Leave only top n target paths with the best similarity
                {
                    tps.RemoveRange(MaxPossibleTargetPaths, tps.Count - MaxPossibleTargetPaths);
                }

                // TODO: Cut the tail with similarity less than MinPathSimilarity

                matches.Add(Tuple.Create(sp, tps));
                lengths[i] = tps.Count;
            }

            // Meaning of offsets[i]:
            //   -1                   : no choice made yet for column i
            //   0 .. lengths[i] - 1  : index of the chosen target path in matches[i].Item2
            //   lengths[i]           : all candidates exhausted, i.e. source path i is left unmatched
            int[] offsets = new int[colCount]; // Here we store the current state of choices for each columns (target path number)
            for (int i = 0; i < colCount; i++)
            {
                offsets[i] = -1;
            }

            int top = -1; // The current level/top where we change the offset. Depth of recursion.

            // Move to the first column that has at least one candidate (top == colCount if none has)
            do
            {
                ++top;
            } while (top < colCount && lengths[top] == 0);

            int mappingsBuilt = 0; // The number of all hypothesis (mappings) built and checked

            // Builds a Mapping from the first sourcePathCount columns using the current offsets.
            // Columns without a valid choice (offset outside 0 .. lengths[i] - 1) are skipped.
            // NOTE(review): withPrefix is hard-coded to true, so the else branch and both
            // RemoveFirst calls below never execute; prefixes are removed at the end of the method instead.
            Func <int, Mapping> BuildSetMapping = delegate(int sourcePathCount)
            {
                bool    withPrefix = true;
                Mapping mapping;
                if (withPrefix)
                {
                    mapping = new Mapping(sourcePath.Input, targetPath.Input);
                }
                else
                {
                    mapping = new Mapping(sourcePath.Output, targetPath.Output);
                }

                for (int i = 0; i < sourcePathCount; i++)
                {
                    if (offsets[i] < 0 || offsets[i] >= lengths[i])
                    {
                        continue;
                    }

                    ColumnPath sp = matches[i].Item1;
                    if (!withPrefix)
                    {
                        sp.RemoveFirst();
                    }
                    ColumnPath tp = matches[i].Item2[offsets[i]];
                    if (!withPrefix)
                    {
                        tp.RemoveFirst();
                    }

                    mapping.AddMatch(new PathMatch(sp, tp));
                }

                return(mapping);
            };

            // Iterative depth-first search with backtracking over the columns.
            // The loop ends when backtracking moves above the first column (top < 0)
            // or when the hypothesis budget is exhausted.
            while (top >= 0)
            {
                if (top == colCount) // Element is ready. Process new element.
                {
                    // Stop the whole search once more than MaxMappingsToBuild hypotheses have been produced
                    if (++mappingsBuilt > MaxMappingsToBuild)
                    {
                        break;
                    }

                    // Check coverage. However many source paths have been assigned a non-null target path
                    double coverage = 0;
                    for (int i = 0; i < top; i++)
                    {
                        if (offsets[i] >= 0 && offsets[i] < lengths[i])
                        {
                            coverage += 1;
                        }
                    }

                    // Turn the count into a fraction of all source paths
                    coverage /= colCount;

                    if (coverage >= MinSourcePathsMatched)
                    {
                        // Evaluate the whole mapping (aggregated quality with coverage and other parameters)
                        Mapping currentMapping = BuildSetMapping(top);

                        // Mean match similarity, scaled down by the fraction of source paths covered
                        currentMapping.ComputeSimilarity();
                        currentMapping.Similarity *= coverage;
                        if (currentMapping.Similarity >= MinSetMappingQuality)
                        {
                            mappings.Add(currentMapping);
                        }
                    }

                    // Backtrack: reset every trailing column that has no further options,
                    // stopping at the deepest column that can still advance to another candidate
                    top--;
                    while (top >= 0 && (offsets[top] >= lengths[top] || lengths[top] == 0)) // Go up by skipping finished and empty columns
                    {
                        offsets[top--] = -1;
                    }
                }
                else // Find the next valid offset
                {
                    // Partial mapping made of the choices for columns before 'top';
                    // used below to test whether a new match is compatible with them
                    Mapping currentMapping = BuildSetMapping(top);

                    // Advance to the next candidate that is both unused and compatible.
                    // If none is found the loop ends with offsets[top] == lengths[top],
                    // which leaves this source path unmatched in the current hypothesis.
                    for (offsets[top]++; offsets[top] < lengths[top]; offsets[top]++)
                    {
                        ColumnPath sp = matches[top].Item1;
                        ColumnPath tp = matches[top].Item2[offsets[top]]; // New target path

                        bool canUse = true;

                        // Check if it has not been already used as a target for previous paths
                        for (int i = 0; i < top; i++)
                        {
                            if (offsets[i] < 0 || offsets[i] >= lengths[i])
                            {
                                continue;
                            }
                            ColumnPath usedtp = matches[i].Item2[offsets[i]]; // Used target path (by i-th source path)
                            // NOTE(review): reference comparison unless ColumnPath overloads == - confirm.
                            // All candidate lists are filled from the same targetPaths list.
                            if (usedtp == tp)
                            {
                                canUse = false; break;
                            }
                        }
                        if (!canUse)
                        {
                            continue;
                        }

                        // Let the partial mapping decide whether the new match fits the earlier choices
                        canUse = currentMapping.Compatible(new PathMatch(sp, tp));
                        if (!canUse)
                        {
                            continue;
                        }

                        break; // Found
                    }

                    // Offset chosen. Go forward by skipping empty columns.
                    top++;
                    while (top < colCount && (offsets[top] >= lengths[top] || lengths[top] == 0)) // Go up (forward) by skipping finished and empty columns
                    {
                        top++;
                    }
                }
            }

            // Best hypotheses first
            mappings = mappings.OrderByDescending(m => m.Similarity).ToList();

            // Remove prefixes
            foreach (Mapping m in mappings)
            {
                m.RemoveFirst(sourcePath, targetPath);
            }

            // Record the results on this object as well as returning them
            Mappings.AddRange(mappings);
            return(mappings);
        }