// Shared RNG: the previous code created a new Random seeded from
// DateTime.Now.Ticks on every call, which can yield identical sequences
// when the method is invoked in quick succession.
private static readonly Random _unscannedPicker = new Random();

/// <summary>
/// Picks a random not-yet-scanned file document from the 'files' collection.
/// </summary>
/// <returns>
/// A populated <see cref="CrawlDocument"/>, or null when no unscanned files remain.
/// </returns>
public CrawlDocument GetNewRandomUnscannedDocument()
{
    QueryDocument filter = new QueryDocument();
    filter.Add("Scanned", false);
    filter.Add("ClassName", "File");

    // Count the matches, then skip to a random offset and take one document.
    // Not efficient for huge collections, but 'files' shouldn't be too large.
    // http://stackoverflow.com/questions/3975290/produce-a-random-number-in-a-range-using-c-sharp
    long num = files.Find(filter).Count();
    if (num == 0)
        return null; // nothing left to scan

    int x = _unscannedPicker.Next((int)num); // upper bound is exclusive

    var candidates = files.Find(filter).SetSkip(x).SetLimit(1);
    foreach (var file in candidates)
    {
        CrawlDocument result = new CrawlDocument();
        result.ClassName = "File";
        result.FileId = file["FileId"].ToString();
        result.Hash = file["Hash"].ToString();
        result.Path = file["Path"].ToString();
        result.Scanned = file["Scanned"].ToBoolean();
        return result;
    }
    return null;
}
/// <summary>
/// Returns all file documents with the given FileId from the 'files' collection.
/// </summary>
/// <param name="fileId">The FileId value to look up.</param>
/// <returns>Matching documents as <see cref="CrawlDocument"/>s; empty list when none match.</returns>
public List<CrawlDocument> GetFile(string fileId)
{
    List<CrawlDocument> resultList = new List<CrawlDocument>();
    var files = DatabaseMongo.GetCollection<BsonDocument>("files");

    QueryDocument filter = new QueryDocument();
    filter.Add("FileId", fileId);
    filter.Add("ClassName", "File");

    foreach (BsonDocument file in files.Find(filter))
    {
        CrawlDocument result = new CrawlDocument();
        // Consistency fix: the filter guarantees ClassName == "File", but the
        // field was previously left unset on the returned object (unlike
        // GetNewRandomUnscannedDocument, which does set it).
        result.ClassName = "File";
        result.FileId = file["FileId"].ToString();
        result.Hash = file["Hash"].ToString();
        result.Path = file["Path"].ToString();
        result.Scanned = file["Scanned"].ToBoolean();
        resultList.Add(result);
    }
    return resultList;
}
/// <summary>
/// Inserts 1000 unscanned file records, then verifies that two random picks
/// differ and that the whole run stays within a time budget.
/// </summary>
public void TestGetNewRandomUnscannedDocument()
{
    db.Clear();
    int numRecords = 1000;

    // BUG FIX: the stopwatch was created but never started, so
    // ElapsedMilliseconds was always 0 and the timing assertion below
    // passed vacuously. StartNew() creates it already running.
    Stopwatch timer = Stopwatch.StartNew();

    for (int i = 0; i < numRecords; i++)
    {
        CrawlDocument cd = new CrawlDocument();
        cd.Hash = Guid.NewGuid().ToString();
        cd.Path = i.ToString();
        cd.ClassName = "File";
        db.InsertIntoFiles(cd);
    }

    CrawlDocument cd1 = db.GetNewRandomUnscannedDocument();
    CrawlDocument cd2 = db.GetNewRandomUnscannedDocument();

    // Two random picks out of 1000 records should (almost always) differ.
    // NOTE(review): there is a ~0.1% chance of a collision by design of the
    // random selection; acceptable for this smoke test.
    Assert.IsFalse(cd1.FileId == cd2.FileId);

    timer.Stop();
    // Budget: assume 3 ms per inserted record is enough.
    Assert.IsTrue(timer.ElapsedMilliseconds < 3 * numRecords);
}
/// <summary>
/// Recursively collects all .dwg files under a directory, copies each into the
/// data directory under its generated FileId, and registers it in the database.
/// </summary>
/// <param name="dir">Root directory to search for *.dwg files.</param>
/// <param name="dbName">Name of the database to register the files in.</param>
/// <param name="dataDir">
/// Destination directory for the copied files. Generalized from the previously
/// hard-coded c:\Data\, which remains the default for backward compatibility.
/// </param>
static void Scan(string dir, string dbName, string dataDir = @"c:\Data\")
{
    // Open the folder and select every .dwg file in its whole subtree.
    string[] dwgFiles = Directory.GetFiles(dir, "*.dwg", SearchOption.AllDirectories);

    DbMongo db = new DbMongo(dbName);

    foreach (string dwgFile in dwgFiles)
    {
        CrawlDocument cDoc = new CrawlDocument(dwgFile);
        // Store the copy under its FileId so names never collide.
        FileCopy(dwgFile, Path.Combine(dataDir, cDoc.FileId + ".dwg"));
        db.InsertIntoFiles(cDoc);
    }

    // NOTE(review): the original contained a commented-out plan to start one
    // scanning task per CPU core (Task.Factory.StartNew per core); scanning is
    // currently performed by manually launching nanoCAD instances instead.
    // Kept as a note rather than dead code.
    // http://cplus.about.com/od/learnc/a/multi-threading-using-task-parallel-library.htm
}
/// <summary>
/// Inserts a crawl document into the 'files' collection, unless a document
/// with the same content hash is already stored (duplicate files are skipped).
/// </summary>
/// <param name="crawlDocument">The document to store.</param>
public void InsertIntoFiles(CrawlDocument crawlDocument)
{
    // Deduplicate by content hash: an existing match means this exact file
    // content has already been registered.
    var byHash = new QueryDocument("Hash", crawlDocument.Hash);
    bool alreadyStored = files.FindOne(byHash) != null;

    if (!alreadyStored)
        files.Insert(crawlDocument.ToBsonDocument());
}
/// <summary>
/// Repeatedly pulls a random unscanned document from the "SingleFile" database
/// and scans it, until no unscanned documents remain.
/// </summary>
/// <param name="closeAfterComplete">
/// When true (default), quits the host application after the crawl finishes.
/// </param>
public static void Crawl(bool closeAfterComplete = true)
{
    DbMongo sqlDB = new DbMongo("SingleFile");

    // Keep fetching random unscanned documents until the database is exhausted.
    for (CrawlDocument crawlDoc = sqlDB.GetNewRandomUnscannedDocument();
         crawlDoc != null;
         crawlDoc = sqlDB.GetNewRandomUnscannedDocument())
    {
        crawlAcDbDocument cDoc = new crawlAcDbDocument(crawlDoc);
        cDoc.sqlDB = sqlDB;
        cDoc.ScanDocument();
    }

    if (closeAfterComplete)
        HostMgd.ApplicationServices.Application.Quit();
}
// Verifies InsertIntoFiles for both the raw-JSON overload and the
// CrawlDocument overload, checking presence via HasFileHash/HasFileId.
public void TestInsertIntoFiles()
{
    db.Clear();

    // Two pre-captured 'files' documents (UNC paths to an archive share are
    // data only — nothing is touched on disk for this part of the test).
    string json1 = @" { 'ClassName': 'File', 'FileId': 'bc6a1669-51ce-444c-94c6-cfec71c0f44d', 'Hash': 'd520b80512f226e81dd72294037657fd', 'Path': '\\\\FILESERVER\\home\\#АРХИВ 2014\\Объекты\\МНОГОТОПЛИВНАЯ АЗС №15\\задание на фундаменты.dwg', 'Scanned': false, '_id': { '$oid': '55a49dfff80dc7180c8228d3' } }";
    string json2 = @" { 'ClassName': 'File', 'FileId': '9e2769ff-678f-401b-8d10-e0581aa6bf98', 'Hash': '253ffb6063333c5bfc1109c5d7db1945', 'Path': '\\\\FILESERVER\\home\\#АРХИВ 2014\\Объекты\\МНОГОТОПЛИВНАЯ АЗС №15\\образец исх данные.dwg', 'Scanned': false, '_id': { '$oid': '55a49dfff80dc7180c8228d4' } } ";

    db.InsertIntoFiles(json1);
    db.InsertIntoFiles(json2);

    // Both documents should be retrievable by hash and by id after insertion.
    Assert.IsTrue(db.HasFileHash("d520b80512f226e81dd72294037657fd"));
    Assert.IsTrue(db.HasFileId("bc6a1669-51ce-444c-94c6-cfec71c0f44d"));
    Assert.IsTrue(db.HasFileHash("253ffb6063333c5bfc1109c5d7db1945"));
    Assert.IsTrue(db.HasFileId("9e2769ff-678f-401b-8d10-e0581aa6bf98"));

    db.Clear();

    // NOTE(review): this part depends on a machine-specific local file
    // (D:\Documents\Desktop\SingleFile\...); the test will fail on machines
    // where that file does not exist — consider replacing with a temp fixture.
    Crawl.CrawlDocument cdoc = new CrawlDocument(@"D:\Documents\Desktop\SingleFile\+b3826065-07d1-4d4a-8af4-35ebc3630117.dwg");
    db.InsertIntoFiles(cdoc);

    Assert.IsTrue(db.HasFileHash(cdoc.Hash));
    Assert.IsTrue(db.HasFileId(cdoc.FileId));
}
/// <summary>
/// Collects the resolved external references (xrefs) of a document as a list
/// of crawl documents; unresolved xrefs are skipped.
/// </summary>
/// <param name="aDoc">The document whose xref graph is inspected.</param>
/// <returns>One <see cref="CrawlDocument"/> per resolved xref; empty when none.</returns>
private List<CrawlDocument> GetXrefs(Document aDoc)
{
    // http://adndevblog.typepad.com/autocad/2012/06/finding-all-xrefs-in-the-current-database-using-cnet.html
    XrefGraph xGraph = aDoc.Database.GetHostDwgXrefGraph(false);
    int nodeCount = xGraph.NumNodes;

    var result = new List<CrawlDocument>();
    for (int node = 0; node < nodeCount; node++)
    {
        XrefGraphNode xref = xGraph.GetXrefNode(node);

        // Only resolved xrefs have a usable backing file on disk.
        if (xref.XrefStatus != XrefStatus.Resolved)
            continue;

        result.Add(new CrawlDocument(xref.Database.Filename));
    }
    return result;
}
/// <summary>
/// Wraps a database crawl record and opens its copied .dwg file (stored on
/// disk as "&lt;FileId&gt;.dwg") from the data directory via the Teigha
/// document manager.
/// </summary>
/// <param name="crawlDoc">Database record describing the file to open.</param>
public crawlAcDbDocument(CrawlDocument crawlDoc)
{
    this.FullPath = crawlDoc.Path;
    this.FileId = crawlDoc.FileId;
    // Presumably _dataDir is the same directory Scan() copies files into
    // (files are stored under their FileId) — TODO confirm.
    this.teighaDocument = TeighaApp.DocumentManager.Open(Path.Combine(_dataDir, crawlDoc.FileId + ".dwg"));
}