Exemplo n.º 1
0
        /// <summary>
        /// Imports nodes and edges data into GraphView.
        /// Run the following command to enable minimal logging,
        /// which will greatly improve the performance of bulk loading:
        /// USE master; ALTER DATABASE database_name SET RECOVERY BULK_LOGGED;
        /// </summary>
        /// <remarks>
        /// The import runs in four phases: (1) the headers of all node and edge files are parsed,
        /// (2) the node-table and edge metadata is derived from these headers and validated,
        /// (3) the node tables are created and bulk loaded, and (4) every edge file is bulk loaded into
        /// a temporary table and folded into the node tables. Phases 3 and 4 run inside one transaction
        /// that is rolled back on any failure.
        /// </remarks>
        /// <param name="nodesFileName"> The list of node file name(s)</param>
        /// <param name="edgesFileName"> The list of edge file name(s)</param>
        /// <param name="directory"> The directory of the node and edge data files. When it is null or empty,
        /// the file names are used exactly as given.</param>
        /// <param name="skipScanLabel"> True, notifies GraphView that every node file has only one label and 
        /// every edge file has only one type. This will improve the performance of importing.</param>
        /// <param name="fieldTerminator"> The field terminator of data files</param>
        /// <param name="byDefaultType"> The default data type. It is lower-cased and looked up in
        /// FileInfo.TypeDict.</param>
        /// <exception cref="BulkInsertNodeException">The directory or a node file does not exist, the default
        /// type is not supported, the node files are inconsistent, or any error occurs while the
        /// transaction is running (including errors raised while loading edges).</exception>
        /// <exception cref="BulkInsertEdgeException">An edge file does not exist or the edge files are
        /// inconsistent with the node files.</exception>
        public void Import(IList<string> nodesFileName, IList<string> edgesFileName, string directory,
            bool skipScanLabel = false, string fieldTerminator = ",", string byDefaultType = "string")
        {
            //Prefixes every file name with the directory, when one is given
            if (!string.IsNullOrEmpty(directory))
            {
                if (!Directory.Exists(directory))
                {
                    throw new BulkInsertNodeException(String.Format("The directory {0} does not exist.", directory));
                }
                nodesFileName = nodesFileName.Select(x => directory + "\\" + x).ToList();
                edgesFileName = edgesFileName.Select(x => directory + "\\" + x).ToList();
            }

            //Replaces the requested default type by the value registered for it in FileInfo.TypeDict
            string defaultTypeKey = byDefaultType.ToLower();
            if (!FileInfo.TypeDict.ContainsKey(defaultTypeKey))
            {
                throw new BulkInsertNodeException("The type by default is not supported. The type supported includes:\n" +
                                                  "int,long,float,double,boolean,byt,short,char,string\n");
            }
            byDefaultType = FileInfo.TypeDict[defaultTypeKey];

            FileInfo.FieldTerminator = fieldTerminator;
            FileInfo.RowTerminator = "\r\n";
            FileInfo.ByDefaultType = byDefaultType;
            FileInfo.SkipScanLabel = skipScanLabel;

            //Collects file header's information
            var nodeFileToInfo = new Dictionary<string, NodeFileInfo>();
            foreach (var nodePath in nodesFileName)
            {
                if (!File.Exists(nodePath))
                {
                    throw new BulkInsertNodeException(String.Format("The file {0} does not exist.", nodePath));
                }

                var nodeHeader = new NodeFileInfo(nodePath);
                nodeHeader.getHeader();
                nodeHeader.ParseHeader();
                nodeFileToInfo[nodePath] = nodeHeader;
            }

            var edgeFileToInfo = new Dictionary<string, EdgeFileInfo>();
            foreach (var edgePath in edgesFileName)
            {
                if (!File.Exists(edgePath))
                {
                    throw new BulkInsertEdgeException(String.Format("The file {0} does not exist.", edgePath));
                }

                var edgeHeader = new EdgeFileInfo(edgePath);
                edgeHeader.getHeader();
                edgeHeader.ParseHeader();
                edgeFileToInfo[edgePath] = edgeHeader;
            }

            //Maps every node-file name space to the node tables (labels) declared in it
            var nameSpaceToNodeTableSet = new Dictionary<string, HashSet<string>>();

            //Generates node table's information
            var nodeTableToInfo = new Dictionary<string, NodeInfo>();
            foreach (var it in nodeFileToInfo)
            {
                NodeFileInfo nodeFile = it.Value;
                foreach (var label in nodeFile.Labels)
                {
                    NodeInfo nodeInfo;
                    if (!nodeTableToInfo.TryGetValue(label, out nodeInfo))
                    {
                        nodeInfo = new NodeInfo();
                    }

                    //Assigns properties
                    foreach (var property in nodeFile.ColumnToType)
                    {
                        if (!nodeInfo.AddProperty(property.Key, property.Value))
                        {
                            throw new BulkInsertNodeException(
                                String.Format(
                                    "The label \"{0}\" contains column \"{1}\" in different types in two different file",
                                    label, property.Key));
                        }
                    }

                    //Assigns user id. Two files that share a label must agree on both the id column name
                    //and its type; a mismatch in either one is a conflict.
                    var userid = Tuple.Create(nodeFile.UserId.ToLower(), byDefaultType.ToLower());
                    if (nodeInfo.UserId != null &&
                        !(userid.Item1 == nodeInfo.UserId.Item1 && userid.Item2 == nodeInfo.UserId.Item2))
                    {
                        throw new BulkInsertNodeException(String.Format("The label \"{0}\" contains two differenct ids in two node files",
                            label));
                    }
                    nodeInfo.UserId = userid;

                    nodeInfo.tableName = label;
                    nodeTableToInfo[label] = nodeInfo;

                    //Updates name space dictionary
                    HashSet<string> nodeTableSet;
                    if (!nameSpaceToNodeTableSet.TryGetValue(nodeFile.NameSpace, out nodeTableSet))
                    {
                        nodeTableSet = new HashSet<string>();
                        nameSpaceToNodeTableSet[nodeFile.NameSpace] = nodeTableSet;
                    }
                    nodeTableSet.Add(label);
                }
            }

            //Generates edge file's information
            foreach (var it in edgeFileToInfo)
            {
                EdgeFileInfo edgeFile = it.Value;

                //An unknown name space is reported as an edge error instead of a raw KeyNotFoundException
                HashSet<string> startNodeTable;
                if (!nameSpaceToNodeTableSet.TryGetValue(edgeFile.StartNameSpace, out startNodeTable) ||
                    startNodeTable.Count < 1)
                {
                    throw new BulkInsertEdgeException(
                        string.Format("Cannot find the namespace \"{0}\" in node files",
                            edgeFile.StartNameSpace));
                }

                HashSet<string> endNodeTable;
                if (!nameSpaceToNodeTableSet.TryGetValue(edgeFile.EndNameSpace, out endNodeTable) ||
                    endNodeTable.Count < 1)
                {
                    throw new BulkInsertEdgeException(
                        string.Format("Cannot find the namespace \"{0}\" in node files",
                            edgeFile.EndNameSpace));
                }

                //The sink of an edge must be exactly one node table
                if (endNodeTable.Count > 1)
                {
                    throw new BulkInsertEdgeException("One edge cannot refer to two different node tables");
                }

                var edge = new EdgeInfo();
                edgeFile.sinkTable = edge.Sink = endNodeTable.First();

                foreach (var attribute in edgeFile.ColumnToType)
                {
                    if (!edge.AddAtrribute(attribute.Key, attribute.Value))
                    {
                        throw new BulkInsertEdgeException(
                            string.Format("The Edge data file \"{0}\" contains two attributes of same name.", it.Key));
                    }
                }

                //Registers the edge under each of its types on every possible source node table
                foreach (var edgeType in edgeFile.Labels)
                {
                    foreach (var sourceTable in startNodeTable)
                    {
                        if (!nodeTableToInfo[sourceTable].AddEdge(edgeType, edge))
                        {
                            throw new BulkInsertEdgeException(
                                string.Format("There exists edge type \"{0}\" conflicts on node table \"{1}\" ",
                                    edgeType, sourceTable));
                        }
                    }
                }
            }

            //Both the transaction and the command are disposed when the import finishes or fails
            using (var transaction = Conn.BeginTransaction())
            using (var command = Conn.CreateCommand())
            {
                command.Transaction = transaction;
                command.CommandTimeout = 0;
                try
                {
                    //Creates node tables, then drops their primary-key and unique constraints;
                    //both are added back after the node data is loaded
                    foreach (var pair in nodeTableToInfo)
                    {
                        CreateNodeTable(pair.Value.ToString(), transaction);

                        const string dropConstrain = @"
                        ALTER TABLE {0} DROP CONSTRAINT {1}";
                        string constrainName = "dbo" + pair.Value.tableName + "_PK_GlobalNodeId";
                        command.CommandText = string.Format(dropConstrain, pair.Value.tableName, constrainName);
                        command.ExecuteNonQuery();

                        string indexName = "dbo" + pair.Value.tableName + "_UQ_" + pair.Value.UserId.Item1;
                        command.CommandText = string.Format(dropConstrain, pair.Value.tableName, indexName);
                        command.ExecuteNonQuery();
                    }

                    //Bulk inserts nodes
                    foreach (var pair in nodeFileToInfo)
                    {
                        var nodeFile = pair.Value;

                        //Builds one (name, type) entry per header position: the id and label positions
                        //get fixed names, every other position takes the next property column
                        var dataColumnName = new List<string>(nodeFile.FileHeader.Count);
                        var columnDataType = new List<string>(nodeFile.FileHeader.Count);

                        using (var it = nodeFile.ColumnToType.GetEnumerator())
                        {
                            for (int i = 0; i < nodeFile.FileHeader.Count; i++)
                            {
                                if (i == nodeFile.UserIdOffset)
                                {
                                    dataColumnName.Add(nodeFile.UserId);
                                    columnDataType.Add(convertSqlType(byDefaultType));
                                }
                                else if (i == nodeFile.LabelOffset)
                                {
                                    dataColumnName.Add("label");
                                    columnDataType.Add(convertSqlType("nvarchar(4000)"));
                                }
                                else if (it.MoveNext())
                                {
                                    dataColumnName.Add(it.Current.Key);
                                    columnDataType.Add(convertSqlType(it.Current.Value));
                                }
                            }
                        }

                        foreach (var label in nodeFile.Labels)
                        {
                            var tableNameWithSchema = "dbo." + label;
                            using (var sqlBulkCopy = new SqlBulkCopy(Conn, SqlBulkCopyOptions.TableLock, transaction))
                            {
                                sqlBulkCopy.BulkCopyTimeout = 0;
                                //Unless label scanning is skipped, the reader also receives the label
                                //offset and the label being loaded
                                using (
                                    var reader = skipScanLabel
                                        ? new BulkInsertFileDataReader(nodeFile.FileName, fieldTerminator, "\r\n",
                                            dataColumnName, columnDataType, true)
                                        : new BulkInsertFileDataReader(nodeFile.FileName, fieldTerminator, "\r\n",
                                            dataColumnName, columnDataType, true, nodeFile.LabelOffset, label))
                                {
                                    //The label column is not mapped to the destination table
                                    foreach (var columnName in dataColumnName)
                                    {
                                        if (columnName != "label")
                                        {
                                            sqlBulkCopy.ColumnMappings.Add(
                                                new SqlBulkCopyColumnMapping(columnName, columnName));
                                        }
                                    }
                                    sqlBulkCopy.DestinationTableName = tableNameWithSchema;
                                    sqlBulkCopy.WriteToServer(reader);
                                }
                            }
                        }
                    }

                    //Adds the primary key and unique constraints back on the node tables
                    foreach (var pair in nodeTableToInfo)
                    {
                        const string createPrimaryKey = @"
                        ALTER TABLE {0} ADD CONSTRAINT {1} PRIMARY KEY (GlobalNodeId)";
                        string constrainName = "dbo" + pair.Value.tableName + "_PK_GlobalNodeId";
                        command.CommandText = string.Format(createPrimaryKey, pair.Value.tableName, constrainName);
                        command.ExecuteNonQuery();

                        const string createUniqueConstraint = @"
                        ALTER TABLE {0} ADD CONSTRAINT {1} UNIQUE ({2})";
                        string indexName = "dbo" + pair.Value.tableName + "_UQ_" + pair.Value.UserId.Item1;
                        command.CommandText = string.Format(createUniqueConstraint, pair.Value.tableName, indexName,
                            pair.Value.UserId.Item1);
                        command.ExecuteNonQuery();
                    }

                    //Bulk inserts edges
                    foreach (var pair in edgeFileToInfo)
                    {
                        var edgeFile = pair.Value;
                        var dataColumnName = new List<string>(edgeFile.FileHeader.Count);
                        var columnDataType = new List<string>(edgeFile.FileHeader.Count);

                        //Exactly one entry is produced per header position, so the branches must be
                        //mutually exclusive; otherwise the attribute columns shift against the header
                        using (var it = edgeFile.ColumnToType.GetEnumerator())
                        {
                            for (int i = 0; i < edgeFile.FileHeader.Count; i++)
                            {
                                if (i == edgeFile.StartIdOffset)
                                {
                                    dataColumnName.Add("startid");
                                    columnDataType.Add(convertSqlType(byDefaultType));
                                }
                                else if (i == edgeFile.EndIdOffset)
                                {
                                    dataColumnName.Add("endid");
                                    columnDataType.Add(convertSqlType(byDefaultType));
                                }
                                else if (i == edgeFile.LabelOffset)
                                {
                                    dataColumnName.Add("type");
                                    columnDataType.Add(convertSqlType(byDefaultType));
                                }
                                else if (it.MoveNext())
                                {
                                    dataColumnName.Add(it.Current.Key);
                                    columnDataType.Add(convertSqlType(it.Current.Value));
                                }
                            }
                        }

                        //These strings depend only on the edge file, so they are built once per file
                        var attributes = string.Join(",\n", edgeFile.ColumnToType.Select(x => x.Key + " " + x.Value));
                        var tempStringForVariable = string.Join(", ", edgeFile.ColumnToType.Select(x => x.Key));
                        if (!string.IsNullOrEmpty(tempStringForVariable))
                        {
                            tempStringForVariable = "," + tempStringForVariable;
                        }

                        HashSet<string> startNodeTable = nameSpaceToNodeTableSet[edgeFile.StartNameSpace];
                        foreach (var edgeColumnName in edgeFile.Labels)
                        {
                            //Create temp table for bulk inserting edge data
                            var randomTempTableName = "dbo." + edgeColumnName + edgeFile.sinkTable + "_" + RandomString();
                            const string createTempTable = @"
                            Create table {0}
                            (
                                startid {1},
                                endid {1},
                                {2}
                            )";

                            command.CommandText = string.Format(createTempTable, randomTempTableName, byDefaultType,
                                attributes);
                            command.ExecuteNonQuery();

                            //Bulk insert
                            using (var sqlBulkCopy = new SqlBulkCopy(Conn, SqlBulkCopyOptions.TableLock, transaction))
                            {
                                sqlBulkCopy.BulkCopyTimeout = 0;
                                using (
                                    var reader = skipScanLabel
                                        ? new BulkInsertFileDataReader(edgeFile.FileName, fieldTerminator, "\r\n",
                                            dataColumnName, columnDataType, true)
                                        : new BulkInsertFileDataReader(edgeFile.FileName, fieldTerminator, "\r\n",
                                            dataColumnName, columnDataType, true, edgeFile.LabelOffset, edgeColumnName))
                                {
                                    //The type column is not part of the temp table
                                    foreach (var columnName in dataColumnName)
                                    {
                                        if (columnName != "type")
                                        {
                                            sqlBulkCopy.ColumnMappings.Add(columnName, columnName);
                                        }
                                    }
                                    sqlBulkCopy.DestinationTableName = randomTempTableName;
                                    sqlBulkCopy.WriteToServer(reader);
                                }
                            }

                            //Creates clustered index on sink node in temp table
                            string clusteredIndexName = "sinkIndex_" + RandomString();
                            const string createClusteredIndex = @"
                            create clustered index [{0}] on {1}([endid])";
                            command.Parameters.Clear();
                            command.CommandText = string.Format(createClusteredIndex, clusteredIndexName,
                                randomTempTableName);
                            command.ExecuteNonQuery();

                            foreach (var sourceTableName in startNodeTable)
                            {
                                //Updates database
                                string aggregeteFunctionName = "dbo_" + sourceTableName + '_' + edgeColumnName + '_' +
                                                               "Encoder";
                                string aggregateFunction = aggregeteFunctionName + "([sinkTable].[GlobalNodeId]" +
                                                           tempStringForVariable +
                                                           ")";
                                const string updateEdgeData = @"
                                Select [{0}].globalnodeid, [GraphView_InsertEdgeInternalTable].binary, [GraphView_InsertEdgeInternalTable].sinkCount into #ParallelOptimalTempTable
                                From 
                                (
                                Select tempTable.[{2}] source, [{3}].{4} as binary, count([sinkTable].[GlobalNodeId]) as sinkCount
                                From {5} tempTable
                                Join [{3}].[{6}] sinkTable
                                On sinkTable.[{7}] = tempTable.[{8}]
                                Group by tempTable.[{2}]
                                )
                                as [GraphView_InsertEdgeInternalTable],
                                [{3}].[{0}]
                                Where [{0}].[{9}] = [GraphView_InsertEdgeInternalTable].source;
                                UPDATE [{3}].[{0}] SET {1} .WRITE(temp.[binary], null, null), {1}OutDegree += sinkCount
                                from #ParallelOptimalTempTable temp
                                where temp.globalnodeid = [{0}].globalnodeid;
                                drop table #ParallelOptimalTempTable;";
                                command.Parameters.Clear();
                                var sinkTableId = nodeTableToInfo[edgeFile.sinkTable].UserId.Item1;
                                var sourceTableId = nodeTableToInfo[sourceTableName].UserId.Item1;
                                command.CommandText = string.Format(updateEdgeData, sourceTableName, edgeColumnName,
                                    "startid",
                                    "dbo", aggregateFunction, randomTempTableName, edgeFile.sinkTable, sinkTableId,
                                    "endid", sourceTableId);
                                command.ExecuteNonQuery();

                                //Adds the number of incoming edges to the in-degree of the sink nodes
                                const string updateReversedEdgeData = @"
                                UPDATE [{3}].[{0}] SET [InDegree] += sourceCount
                                From (
                                    Select tempTable.[{1}] as Sink, count(*) as sourceCount
                                    From {2} tempTable
                                    Join [{5}]
                                    On [{5}].[{6}] = tempTable.[{7}]
                                    Group by tempTable.[{1}]
                                ) as [GraphView_InsertEdgeInternalTable]
                                Where [GraphView_InsertEdgeInternalTable].Sink = [{0}].[{4}]";
                                command.CommandText = string.Format(updateReversedEdgeData, edgeFile.sinkTable, "endid",
                                    randomTempTableName, "dbo", sinkTableId, sourceTableName, sourceTableId,
                                    "startid");
                                command.ExecuteNonQuery();
                            }

                            //Drops temp table only after every source table has consumed it
                            const string dropTempTable = @"
                            drop table {0}";
                            command.CommandText = string.Format(dropTempTable, randomTempTableName);
                            command.ExecuteNonQuery();
                        }
                    }
                    transaction.Commit();
                }
                catch (Exception error)
                {
                    //Every failure inside the transaction, including edge loading failures, is reported
                    //as a BulkInsertNodeException carrying only the original message
                    transaction.Rollback();
                    throw new BulkInsertNodeException(error.Message);
                }
            }
        }