/// <summary>
/// Adds matches for a pair of starting segments, expanding the pair with every
/// match (continuation) found in the supplied mapping.
/// </summary>
/// <param name="sd">Source segment; asserted to start at SourceTab and end at gMapping.SourceTab.</param>
/// <param name="td">Target segment; asserted to start at TargetTab and end at gMapping.TargetTab.</param>
/// <param name="gMapping">Mapping whose matches are appended to the starting segments.</param>
public void AddPaths(DcColumn sd, DcColumn td, Mapping gMapping)
{
    Debug.Assert(sd != null && sd.Input == SourceTab, "Wrong use: source path must start from the source table.");
    Debug.Assert(td != null && td.Input == TargetTab, "Wrong use: target path must start from the target table.");

    Debug.Assert(sd != null && sd.Output == gMapping.SourceTab, "Wrong use: source path must end where the mapping starts.");
    Debug.Assert(td != null && td.Output == gMapping.TargetTab, "Wrong use: target path must end where the mapping starts.");

    // Without continuations only the two starting segments are matched
    // (for example, for mappings between primitive tables).
    if (gMapping.Matches.Count == 0)
    {
        ColumnPath singleSource = new ColumnPath();
        singleSource.InsertLast(sd);

        ColumnPath singleTarget = new ColumnPath();
        singleTarget.InsertLast(td);

        Matches.Add(new PathMatch(singleSource, singleTarget));
    }

    // Each continuation yields one match: starting segment followed by the continuation path.
    foreach (PathMatch continuation in gMapping.Matches)
    {
        ColumnPath extendedSource = new ColumnPath();
        extendedSource.InsertLast(sd);
        extendedSource.InsertLast(continuation.SourcePath);

        ColumnPath extendedTarget = new ColumnPath();
        extendedTarget.InsertLast(td);
        extendedTarget.InsertLast(continuation.TargetPath);

        Matches.Add(new PathMatch(extendedSource, extendedTarget));
    }
}
/// <summary>
/// Finds the node reached by following the segments of the path from this node.
/// </summary>
/// <param name="path">Path to follow; asserted to start at this node's Set.</param>
/// <returns>
/// The node for the last segment, or null when the path has no segments or
/// some segment has no corresponding child.
/// </returns>
public ColumnTree FindPath(ColumnPath path)
{
    Debug.Assert(path != null && path.Input == Set, "Wrong use: path must start from the node it is added to.");

    if (path.Segments == null || path.Segments.Count == 0)
    {
        return null;
    }

    ColumnTree current = this;
    foreach (DcColumn segment in path.Segments)
    {
        // Descend one level; a missing child means the path is not in the tree.
        current = current.GetChild(segment);
        if (current == null)
        {
            return null;
        }
    }

    return current;
}
/// <summary>
/// Finds the node reached by the path, creating any nodes that are missing along the way.
/// </summary>
/// <param name="path">Path to follow; asserted to start at this node's Set.</param>
/// <returns>The node for the last segment, or null when the path has no segments.</returns>
public ColumnTree AddPath(ColumnPath path)
{
    Debug.Assert(path != null && path.Input == Set, "Wrong use: path must start from the node it is added to.");

    if (path.Segments == null || path.Segments.Count == 0)
    {
        return null;
    }

    ColumnTree current = this;
    foreach (DcColumn segment in path.Segments)
    {
        ColumnTree next = current.GetChild(segment);
        if (next == null)
        {
            // New nodes get the same runtime type as their parent node.
            next = (ColumnTree)Activator.CreateInstance(current.GetType());
            next.Column = segment;
            current.AddChild(next);
        }

        current = next;
    }

    return current;
}
/// <summary>
/// Checks whether the path starts at the source table or at a table for which
/// SourceTab.IsInput returns true.
/// </summary>
/// <param name="path">Path whose Input table is checked.</param>
/// <returns>True when the path's start table is acceptable for the source side.</returns>
public bool IsSourcePathValid(ColumnPath path)
{
    return path.Input == SourceTab || SourceTab.IsInput(path.Input);
}
/// <summary>
/// Checks whether the path starts at the target table or at a table for which
/// TargetTab.IsInput returns true.
/// </summary>
/// <param name="path">Path whose Input table is checked.</param>
/// <returns>True when the path's start table is acceptable for the target side.</returns>
public bool IsTargetPathValid(ColumnPath path)
{
    return path.Input == TargetTab || TargetTab.IsInput(path.Input);
}
/// <summary>
/// Inserts all segments of the specified path at the beginning of this path.
/// </summary>
/// <param name="path">
/// Prefix to insert. A null or empty path is ignored, mirroring InsertLast, so that
/// this path's Input is never overwritten by the Input of a path that has no segments.
/// </param>
public void InsertFirst(ColumnPath path)
{
    // Guard first: the assertion below dereferences the argument.
    if (path == null || path.Size == 0)
    {
        return;
    }

    Debug.Assert(Size == 0 || path.Output == Input, "A path must continue the first segment inserted in the beginning.");

    Segments.InsertRange(0, path.Segments);

    // The path now starts where the inserted prefix starts.
    Input = path.Input;
    if (Output == null)
    {
        // This path had no end yet, so it ends where the prefix ends.
        Output = path.Output;
    }
}
/// <summary>
/// Finds the first match whose target side matches the specified path.
/// </summary>
/// <param name="path">Target path passed to PathMatch.MatchesTarget.</param>
/// <returns>The first matching PathMatch in Matches, or null when there is none.</returns>
public PathMatch GetMatchForTarget(ColumnPath path)
{
    return Matches.Find(candidate => candidate.MatchesTarget(path));
}
/// <summary>
/// Removes the given prefixes from the source and target paths of all matches
/// and moves the mapping's source/target tables to the ends of those prefixes.
/// </summary>
/// <param name="sourcePath">Prefix to strip from every source path; skipped when null.</param>
/// <param name="targetPath">Prefix to strip from every target path; skipped when null.</param>
public void RemoveFirst(ColumnPath sourcePath, ColumnPath targetPath)
{
    if (sourcePath != null)
    {
        _sourceTab = sourcePath.Output;
        foreach (PathMatch match in Matches)
        {
            match.SourcePath.RemoveFirst(sourcePath);
        }
    }

    if (targetPath != null)
    {
        _targetTab = targetPath.Output;
        foreach (PathMatch match in Matches)
        {
            match.TargetPath.RemoveFirst(targetPath);
        }
    }
}
/// <summary>
/// Removes the specified match together with all more specific matches (continuations).
/// </summary>
/// <param name="sourcePath">Source path; asserted to start at SourceTab.</param>
/// <param name="targetPath">Target path; asserted to start at TargetTab.</param>
public void RemoveMatch(ColumnPath sourcePath, ColumnPath targetPath)
{
    Debug.Assert(sourcePath.Input == SourceTab, "Wrong use: source path must start from the source table.");
    Debug.Assert(targetPath.Input == TargetTab, "Wrong use: target path must start from the target table.");

    // Collect first, remove afterwards: the list must not change while it is being scanned.
    // A match qualifies when it is the same as, or more specific than, the given pair.
    List<PathMatch> obsolete = Matches.FindAll(
        existing => existing.MatchesSource(sourcePath) && existing.MatchesTarget(targetPath));

    foreach (PathMatch existing in obsolete)
    {
        Matches.Remove(existing);
    }
}
/// <summary>
/// Returns a new path consisting of the specified run of segments of this path.
/// </summary>
/// <param name="index">Zero-based index of the first segment to copy.</param>
/// <param name="count">Number of segments to copy; 0 means all segments from index to the end.</param>
/// <returns>
/// A new path sharing the segment objects. Its Input is the Input of the first copied
/// segment and its Output is the Output of the last one; both are left unset when no
/// segment was copied.
/// </returns>
public ColumnPath SubPath(int index, int count = 0)
{
    ColumnPath ret = new ColumnPath();

    if (count == 0)
    {
        count = Segments.Count - index;
    }

    for (int i = 0; i < count; i++)
    {
        ret.Segments.Add(Segments[index + i]);
    }

    // A path starts at the input of its first segment and ends at the output of its
    // last segment (the same convention InsertLast and RemoveFirst follow). An empty
    // result has no segments to take the ends from.
    if (ret.Segments.Count > 0)
    {
        ret.Input = ret.Segments[0].Input;
        ret.Output = ret.Segments[ret.Segments.Count - 1].Output;
    }

    return ret;
}
/// <summary>
/// Appends all segments of the specified path to the end of this path.
/// </summary>
/// <param name="path">Continuation to append. A null or empty path is ignored.</param>
public void InsertLast(ColumnPath path)
{
    // The guard must run before the assertion, which dereferences the argument;
    // otherwise a null path fails in debug builds before the guard is reached.
    if (path == null || path.Size == 0)
    {
        return;
    }

    Debug.Assert(Size == 0 || path.Input == Output, "A an appended path must continue this path.");

    // AddRange copes with the argument being this very path, where an index loop
    // over a growing list would never terminate.
    Segments.AddRange(path.Segments);

    // The path now ends where the appended continuation ends.
    Output = path.Output;
    if (Input == null)
    {
        // This path had no start yet, so it starts where the continuation starts.
        Input = path.Input;
    }
}
/// <summary>
/// Removes the leading segments of this path when they coincide with the specified path.
/// </summary>
/// <param name="path">Prefix to remove; nothing happens unless this path starts with it.</param>
public void RemoveFirst(ColumnPath path)
{
    // Too short to contain the prefix, or it simply does not start with it: nothing to remove.
    if (Segments.Count < path.Segments.Count || !this.StartsWith(path))
    {
        return;
    }

    Segments.RemoveRange(0, path.Segments.Count);

    // The path now starts at its new first segment; an emptied path starts where it ends.
    Input = Segments.Count > 0 ? Segments[0].Input : Output;
}
/// <summary>
/// Finds the path starting from the target set that is most similar to the source path.
/// </summary>
/// <param name="sourcePath">Path every candidate is compared with.</param>
/// <param name="targetSet">Table whose paths (enumerated with ColumnType.IDENTITY_ENTITY) are the candidates.</param>
/// <returns>
/// The candidate with the highest similarity (the first one on ties), or null when
/// there are no candidates or none scores above Double.MinValue.
/// </returns>
public ColumnPath MapCol(ColumnPath sourcePath, DcTable targetSet)
{
    List<ColumnPath> candidates = (new PathEnumerator(targetSet, ColumnType.IDENTITY_ENTITY)).ToList();

    ColumnPath winner = null;
    double winnerScore = Double.MinValue;

    // With no candidates the loop body never runs and null is returned.
    for (int i = 0; i < candidates.Count; i++)
    {
        ColumnPath candidate = candidates[i];
        double score = StringSimilarity.ComputePathSimilarity(sourcePath, candidate);
        if (score > winnerScore)
        {
            winnerScore = score;
            winner = candidate;
        }
    }

    return winner;
}
/// <summary>
/// Build mappings from the source set to the target set. The tables are greater tables of the specified columns.
/// The mapping should take into account (semantically) that these tables are used from these columns.
/// </summary>
/// <remarks>
/// Steps performed by the code below:
/// 1. Enumerate continuations of both prefix paths with PathEnumerator and prepend the prefix to each;
///    when there are no continuations the prefix itself is used as the only path.
/// 2. For every source path, rank all target paths by StringSimilarity.ComputePathSimilarity and keep
///    at most MaxPossibleTargetPaths of them.
/// 3. Enumerate assignments of target paths to source paths iteratively (no recursion): offsets[i] is the
///    index of the target path currently chosen for source path i, and top is the source path being decided.
///    An offset of -1, or one equal to lengths[i], means that source path i has no target path assigned.
/// 4. Each complete assignment is scored; assignments passing MinSourcePathsMatched and MinSetMappingQuality
///    are collected, until more than MaxMappingsToBuild assignments have been examined.
/// 5. The collected mappings are sorted by descending Similarity, the prefixes are removed from them,
///    and they are appended to Mappings.
/// </remarks>
/// <param name="sourcePath">Prefix path on the source side; continuations start from its Output.</param>
/// <param name="targetPath">Prefix path on the target side; continuations start from its Output.</param>
/// <returns>The mappings built by this call, ordered by descending Similarity.</returns>
public List <Mapping> MapCol(ColumnPath sourcePath, ColumnPath targetPath)
{
    // We analyze all continuations of the specified prefix paths
    List <ColumnPath> sourcePaths = (new PathEnumerator(sourcePath.Output, ColumnType.IDENTITY_ENTITY)).ToList();
    sourcePaths.ForEach(p => p.InsertFirst(sourcePath));
    if (sourcePaths.Count == 0)
    {
        // No continuations: the prefix object itself becomes the only source path
        sourcePaths.Add(sourcePath);
    }

    List <ColumnPath> targetPaths = (new PathEnumerator(targetPath.Output, ColumnType.IDENTITY_ENTITY)).ToList();
    targetPaths.ForEach(p => p.InsertFirst(targetPath));
    if (targetPaths.Count == 0)
    {
        // No continuations: the prefix object itself becomes the only target path
        targetPaths.Add(targetPath);
    }

    List <Mapping> mappings = new List <Mapping>();

    int colCount = sourcePaths.Count();

    var matches = new List <Tuple <ColumnPath, List <ColumnPath> > >(); // List of: <srcPath, targetPaths>
    int[] lengths = new int[colCount]; // Each column has some length (some valid target paths)
    for (int i = 0; i < colCount; i++)
    {
        ColumnPath sp = sourcePaths[i];
        List <ColumnPath> tps = new List <ColumnPath>();

        // Sort target paths according to their similarity
        // (the candidate lists share the same ColumnPath objects taken from targetPaths)
        tps.AddRange(targetPaths);
        tps = tps.OrderByDescending(p => StringSimilarity.ComputePathSimilarity(sp, p)).ToList();
        if (tps.Count > MaxPossibleTargetPaths) // Leave only top n target paths with the best similarity
        {
            tps.RemoveRange(MaxPossibleTargetPaths, tps.Count - MaxPossibleTargetPaths);
        }

        // TODO: Cut the tail with similarity less than MinPathSimilarity

        matches.Add(Tuple.Create(sp, tps));
        lengths[i] = tps.Count;
    }

    int[] offsets = new int[colCount]; // Here we store the current state of choices for each column (target path number)
    for (int i = 0; i < colCount; i++)
    {
        offsets[i] = -1; // -1 means that no target path has been tried yet for this column
    }

    int top = -1; // The current level/top where we change the offset. Depth of recursion.
    // Move to the first column that has at least one candidate target path
    do
    {
        ++top;
    } while (top < colCount && lengths[top] == 0);

    int mappingsBuilt = 0; // The number of all hypotheses (mappings) built and checked

    // Builds a mapping from the choices currently stored in offsets for the first sourcePathCount columns.
    // Columns without an assigned target path are skipped.
    Func <int, Mapping> BuildSetMapping = delegate(int sourcePathCount)
    {
        // withPrefix is a local constant set to true, so the branches guarded by !withPrefix never run
        bool withPrefix = true;

        Mapping mapping;
        if (withPrefix)
        {
            mapping = new Mapping(sourcePath.Input, targetPath.Input);
        }
        else
        {
            mapping = new Mapping(sourcePath.Output, targetPath.Output);
        }

        for (int i = 0; i < sourcePathCount; i++)
        {
            if (offsets[i] < 0 || offsets[i] >= lengths[i])
            {
                continue; // This source path has no target path assigned
            }

            ColumnPath sp = matches[i].Item1;
            if (!withPrefix)
            {
                sp.RemoveFirst();
            }
            ColumnPath tp = matches[i].Item2[offsets[i]];
            if (!withPrefix)
            {
                tp.RemoveFirst();
            }

            // The match references the shared path objects; they are not copied here
            mapping.AddMatch(new PathMatch(sp, tp));
        }

        return(mapping);
    };

    while (top >= 0)
    {
        if (top == colCount) // Element is ready. Process new element.
        {
            if (++mappingsBuilt > MaxMappingsToBuild)
            {
                break; // Limit on the number of examined assignments has been exceeded
            }

            // Check coverage: the share of source paths that have been assigned a target path
            double coverage = 0;
            for (int i = 0; i < top; i++)
            {
                if (offsets[i] >= 0 && offsets[i] < lengths[i])
                {
                    coverage += 1;
                }
            }
            coverage /= colCount;

            if (coverage >= MinSourcePathsMatched)
            {
                // Evaluate the whole mapping (aggregated quality with coverage and other parameters)
                Mapping currentMapping = BuildSetMapping(top);

                currentMapping.ComputeSimilarity();
                currentMapping.Similarity *= coverage;
                if (currentMapping.Similarity >= MinSetMappingQuality)
                {
                    mappings.Add(currentMapping);
                }
            }

            top--;
            while (top >= 0 && (offsets[top] >= lengths[top] || lengths[top] == 0)) // Go up by skipping finished and empty columns
            {
                offsets[top--] = -1; // Reset the exhausted column so it restarts when reached again
            }
        }
        else // Find the next valid offset
        {
            // Mapping made of the choices for the columns before top; used for the compatibility check
            Mapping currentMapping = BuildSetMapping(top);

            // If no candidate is accepted, the loop ends with offsets[top] == lengths[top],
            // which leaves this source path without a target path
            for (offsets[top]++; offsets[top] < lengths[top]; offsets[top]++)
            {
                ColumnPath sp = matches[top].Item1;
                ColumnPath tp = matches[top].Item2[offsets[top]]; // New target path

                bool canUse = true;

                // Check if it has not been already used as a target for previous paths
                for (int i = 0; i < top; i++)
                {
                    if (offsets[i] < 0 || offsets[i] >= lengths[i])
                    {
                        continue;
                    }
                    ColumnPath usedtp = matches[i].Item2[offsets[i]]; // Used target path (by i-th source path)
                    if (usedtp == tp)
                    {
                        canUse = false;
                        break;
                    }
                }
                if (!canUse)
                {
                    continue;
                }

                canUse = currentMapping.Compatible(new PathMatch(sp, tp));
                if (!canUse)
                {
                    continue;
                }

                break; // Found
            }

            // Offset chosen. Go forward by skipping empty columns.
            top++;
            while (top < colCount && (offsets[top] >= lengths[top] || lengths[top] == 0)) // Go forward by skipping finished and empty columns
            {
                top++;
            }
        }
    }

    mappings = mappings.OrderByDescending(m => m.Similarity).ToList();

    // Remove prefixes
    foreach (Mapping m in mappings)
    {
        m.RemoveFirst(sourcePath, targetPath);
    }

    // The results are also accumulated in Mappings
    Mappings.AddRange(mappings);
    return(mappings);
}
/// <summary>
/// Tests whether this path has the same segments as the specified path.
/// </summary>
/// <param name="path">Path whose segments are compared with this path.</param>
/// <returns>The result of the segment-list overload of SamePath.</returns>
public bool SamePath(ColumnPath path)
{
    var otherSegments = path.Segments;
    return SamePath(otherSegments);
}
/// <summary>
/// Copy constructor: creates new ColumnPath objects from the paths of the given match
/// and copies its similarity.
/// </summary>
/// <param name="m">Match to copy.</param>
public PathMatch(PathMatch m)
{
    ColumnPath sourceCopy = new ColumnPath(m.SourcePath);
    ColumnPath targetCopy = new ColumnPath(m.TargetPath);

    SourcePath = sourceCopy;
    TargetPath = targetCopy;
    Similarity = m.Similarity;
}
/// <summary>
/// Creates a match between two paths with an explicit similarity.
/// The path objects are stored by reference, not copied.
/// </summary>
/// <param name="sourcePath">Path on the source side.</param>
/// <param name="targetPath">Path on the target side.</param>
/// <param name="similarity">Similarity value stored with the match.</param>
public PathMatch(ColumnPath sourcePath, ColumnPath targetPath, double similarity)
{
    this.SourcePath = sourcePath;
    this.TargetPath = targetPath;
    this.Similarity = similarity;
}
/// <summary>
/// Creates a match between two paths with the similarity set to 1.0.
/// </summary>
/// <param name="sourcePath">Path on the source side.</param>
/// <param name="targetPath">Path on the target side.</param>
public PathMatch(ColumnPath sourcePath, ColumnPath targetPath)
    : this(sourcePath, targetPath, 1.0)
{
    // Everything is done by the three-argument constructor.
}
/// <summary>
/// Tests whether the target path of this match is the same as, or more specific
/// (longer) than, the specified path.
/// </summary>
/// <param name="path">Path expected at the beginning of the target path.</param>
/// <returns>True when TargetPath starts with the specified path.</returns>
public bool MatchesTarget(ColumnPath path)
{
    bool startsWithPath = TargetPath.StartsWith(path);
    return startsWithPath;
}
/// <summary>
/// Intended to insert new segments at a specified position. Not implemented;
/// note that the signature has no position parameter yet.
/// </summary>
/// <param name="path">Segments to insert (currently unused).</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void InsertAt(ColumnPath path)
{
    throw new NotImplementedException();
}
/// <summary>
/// Tests whether this path begins with the segments of the specified path.
/// </summary>
/// <param name="path">Candidate prefix.</param>
/// <returns>The result of the segment-list overload of StartsWith.</returns>
public bool StartsWith(ColumnPath path)
{
    var prefixSegments = path.Segments;
    return StartsWith(prefixSegments);
}
/// <summary>
/// Copy constructor: the base part is initialized from the given path and the
/// segment list is copied into a new list that refers to the same segment objects.
/// </summary>
/// <param name="path">Path to copy.</param>
public ColumnPath(ColumnPath path)
    : base(path)
{
    // A separate list, so later changes to either path's segment list do not affect the other.
    Segments = new List<DcColumn>(path.Segments);
}
/// <summary>
/// Intended to remove the last segments (suffix) of this path. Not implemented.
/// </summary>
/// <param name="path">Suffix to remove (currently unused).</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void RemoveLast(ColumnPath path)
{
    throw new NotImplementedException();
}
/// <summary>
/// Creates a mapping that produces a flat target set: one new target column per source
/// attribute or column, used for copying primitive data from the source set.
/// For a relational (OLEDB) source every entry of GreaterPaths is mapped under its relational
/// column name; for a CSV source every non-super column is mapped and its target type is
/// chosen from the column's sample values. For any other source schema the mapping is
/// returned without matches.
/// Only identity (PK) source columns are expanded; expanding entity columns of joined tables
/// would need a different implementation (joins, artificial column/path names etc.).
/// </summary>
/// <param name="sourceSet">Non-primitive source table.</param>
/// <param name="targetSet">Non-primitive target table that receives the new columns.</param>
/// <param name="targetSchema">Schema supplying the target types; when null the schema of targetSet is used.</param>
/// <returns>The new mapping with one match per mapped source attribute or column.</returns>
public Mapping CreatePrimitive(DcTable sourceSet, DcTable targetSet, DcSchema targetSchema)
{
    Debug.Assert(!sourceSet.IsPrimitive && !targetSet.IsPrimitive, "Wrong use: copy mapping can be created for only non-primitive tables.");
    Debug.Assert(targetSchema != null || targetSet.Schema != null, "Wrong use: target schema must be specified.");

    Mapping map = new Mapping(sourceSet, targetSet);

    DcSchema sourceSchema = map.SourceTab.Schema;
    if (targetSchema == null)
    {
        targetSchema = targetSet.Schema;
    }

    if (sourceSchema is SchemaOledb)
    {
        TableRel relationalTable = (TableRel)map.SourceTab;

        foreach (ColumnAtt att in relationalTable.GreaterPaths)
        {
            ColumnPath srcPath = new ColumnAtt(att);

            // Recommend a matching target type (mapping primitive types)
            this.MapPrimitiveSet(att.Output, targetSchema);
            DcTable targetType = this.GetBestTargetSet(att.Output, targetSchema);

            DcColumn targetColumn = new Schema.Column(att.RelationalColumnName, map.TargetTab, targetType, att.IsKey, false);
            ColumnPath tgtPath = new ColumnPath(targetColumn);
            tgtPath.Name = srcPath.Name;

            map.Matches.Add(new PathMatch(srcPath, tgtPath, 1.0));
        }
    }
    else if (sourceSchema is SchemaCsv)
    {
        DcTable csvTable = (DcTable)map.SourceTab;

        foreach (DcColumn sd in csvTable.Columns)
        {
            if (sd.IsSuper)
            {
                continue;
            }

            // Analyze the sample values of the column and choose the most specific target type:
            // integer first, then double, and string as the fallback.
            List<string> samples = ((ColumnCsv)sd).SampleValues;
            string targetTypeName =
                Com.Schema.Utils.isInt32(samples.ToArray()) ? "Integer"
                : Com.Schema.Utils.isDouble(samples.ToArray()) ? "Double"
                : "String";

            DcTable targetType = targetSchema.GetPrimitiveType(targetTypeName);
            DcColumn targetColumn = targetSchema.Space.CreateColumn(sd.Name, map.TargetTab, targetType, sd.IsKey);

            ColumnPath srcPath = new ColumnPath(sd);
            ColumnPath tgtPath = new ColumnPath(targetColumn);

            map.Matches.Add(new PathMatch(srcPath, tgtPath, 1.0));
        }
    }

    return map;
}
/// <summary>
/// Intended to return the index at which the specified path begins within this path.
/// Not implemented.
/// </summary>
/// <param name="path">Path to locate (currently unused).</param>
/// <returns>Nothing; the method never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public int IndexOf(ColumnPath path)
{
    throw new NotImplementedException();
}