/// <summary>
/// Converts raw UET log lines into serialized <see cref="UETLogView"/> records.
/// </summary>
/// <remarks>
/// Exactly one output string is produced per input line, in input order. A line whose
/// payload cannot be parsed, or that fails one of the validation checks below, yields
/// <c>string.Empty</c> so that the output stays aligned with the input.
/// </remarks>
/// <param name="data">
/// Input lines. Only the text before the first tab character is used; it is decoded as
/// Base64 and parsed with <c>BondReader&lt;UETLog&gt;</c>.
/// </param>
/// <returns>One entry per input line: a serialized view, or an empty string for a rejected line.</returns>
/// <exception cref="FormatException">The first tab-separated field of a line is not valid Base64.</exception>
public IEnumerable<string> GetData(IEnumerable<string> data)
{
    List<string> res = new List<string>();
    var ipDecryptor = new IPAddressDecryptor(keyFileName);
    var reader = new BondReader<UETLog>();
    var tagIdNameMap = new TagIdNameMap();
    var analyticsGuidExtractor = new AnalyticsGuidExtractor();

    foreach (var line in data)
    {
        var eqs = new EnumeratedQueryString();
        var uetLogByte = Convert.FromBase64String(line.Split('\t')[0]);
        UETLog log;
        UETLogView vSchema = new UETLogView();

        if (!reader.TryParse(uetLogByte, out log))
        {
            res.Add(string.Empty);
            continue;
        }

        if (!eqs.TryParse(log.QueryString))
        {
            res.Add(string.Empty);
            continue;
        }

        // Prefer the referrer carried in the query string; fall back to the one on the log record.
        vSchema.ReferrerURL = eqs.ReferrerURL;
        if (String.IsNullOrWhiteSpace(vSchema.ReferrerURL))
        {
            vSchema.ReferrerURL = log.ReferrerURL;
        }

        vSchema.TagId = eqs.TagId;
        vSchema.TagName = eqs.TagName;

        // The referrer / client IP / tag checks only apply to records without an app-install click id.
        if (String.IsNullOrWhiteSpace(eqs.AppInstallClickId))
        {
            if (String.IsNullOrWhiteSpace(vSchema.ReferrerURL)
                || log.ClientIP == null
                || (log.ClientIP.EncryptedIP == null && log.ClientIP.EncryptedIPv6 == null)
                || (vSchema.TagId <= 0 && String.IsNullOrWhiteSpace(vSchema.TagName)))
            {
                res.Add(string.Empty);
                continue;
            }

            // No usable tag id: resolve it from the tag name and the advertiser id.
            if (vSchema.TagId <= 0)
            {
                if (!eqs.AdvertiserId.HasValue)
                {
                    res.Add(string.Empty);
                    continue;
                }

                Dictionary<int, int> customerIdToTagId;
                if (!tagIdNameMap.NameToIdMap.TryGetValue(vSchema.TagName, out customerIdToTagId))
                {
                    res.Add(string.Empty);
                    continue;
                }

                if (!customerIdToTagId.TryGetValue(eqs.AdvertiserId.Value, out vSchema.TagId))
                {
                    res.Add(string.Empty);
                    continue;
                }
            }

            if (!CommonUtils.IsNewUETTagId(vSchema.TagId))
            {
                res.Add(string.Empty);
                continue;
            }

            // Missing tag name: look it up by id, defaulting to an empty name when unknown.
            if (String.IsNullOrWhiteSpace(vSchema.TagName))
            {
                if (!tagIdNameMap.IdToNameMap.TryGetValue(vSchema.TagId, out vSchema.TagName))
                {
                    vSchema.TagName = string.Empty;
                }
            }
        }

        // Fields copied (or converted) from the log record.
        vSchema.ANID = CommonUtils.ParseGuid(log.ANID);
        vSchema.ClientIP = log.ClientIP;
        vSchema.EventDateTime = CommonUtils.FromUtcUnixTimeToTicks(log.EventDateTime);
        vSchema.IsNewMUID = log.IsNewMUID;
        vSchema.LogServerName = log.LogServerName;
        vSchema.MUID = CommonUtils.ParseGuid(log.MUID);
        vSchema.QueryString = log.QueryString;
        vSchema.UserAgent = log.UserAgent;

        // Fields copied from the parsed query string.
        vSchema.AppInstallClickId = eqs.AppInstallClickId;
        vSchema.PageLoad = eqs.PageLoad;
        vSchema.PageTitle = eqs.PageTitle;
        vSchema.UETMatchingGuid = eqs.UETMatchingGuid;
        vSchema.Version = eqs.Version;
        vSchema.NavigatedFromURL = eqs.NavigatedFromURL;
        if (String.IsNullOrWhiteSpace(vSchema.NavigatedFromURL) && eqs.iframe)
        {
            vSchema.NavigatedFromURL = log.ReferrerURL;
        }

        // Custom-event details are only attached when the event type is "custom" (case-insensitive).
        CustomEvent customEvent = null;
        if (String.Equals(eqs.EventType, "custom", StringComparison.OrdinalIgnoreCase))
        {
            customEvent = new CustomEvent
            {
                EventCategory = eqs.EventCategory,
                EventLabel = eqs.EventLabel,
                EventAction = eqs.EventAction,
                EventValue = eqs.EventValue
            };
        }

        vSchema.customEvent = customEvent;

        // The event type is an identifier, not display text, so normalize it with the invariant
        // culture; a culture-sensitive ToLower() gives different results under e.g. tr-TR.
        vSchema.EventType = eqs.EventType == null ? null : eqs.EventType.ToLowerInvariant();
        vSchema.GoalValue = eqs.GoalValue;

        Guid? analyticsGuid = null;
        if (!String.IsNullOrWhiteSpace(log.AnalyticsCookie))
        {
            analyticsGuidExtractor.TryExtractAnalyticsGuid(log.AnalyticsCookie, out analyticsGuid);
        }

        vSchema.AnalyticsGuid = analyticsGuid;

        string ip = null;
        if (log.ClientIP != null)
        {
            ip = DecryptIp(log.ClientIP, ipDecryptor);
        }

        vSchema.IP = string.IsNullOrWhiteSpace(ip) ? "hidden" : ip;

        // When no decrypted IP is available, the encrypted form (IPv6 preferred) is still used
        // as input for the user-agent/IP id below.
        if (String.IsNullOrWhiteSpace(ip) && log.ClientIP != null)
        {
            ip = String.IsNullOrWhiteSpace(log.ClientIP.EncryptedIPv6)
                ? log.ClientIP.EncryptedIP
                : log.ClientIP.EncryptedIPv6;
        }

        vSchema.UAIPId = !String.IsNullOrWhiteSpace(ip)
            ? CommonUtils.GetGuidFromIPUserAgent(ip, log.UserAgent)
            : Guid.Empty;

        // Set dedup key for UET Log
        // If there is mid and rn, and IsNewMUID is false, we still dedup on mid, rn and MUID.
        // If there is mid and rn, and IsNewMUID is true, we will only dedup on mid and rn.
        // If there is no mid or rn, we'll always dedup on timestamp and MUID.
        // ("mid" is read here from eqs.UETMatchingGuid and "rn" from eqs.rn.)
        string dedupKey;
        if (eqs.UETMatchingGuid.HasValue && !String.IsNullOrWhiteSpace(eqs.rn))
        {
            dedupKey = eqs.UETMatchingGuid.Value.ToString("N") + "-" + eqs.rn;
            if (log.IsNewMUID == false && !String.IsNullOrEmpty(log.MUID))
            {
                dedupKey += "-" + log.MUID;
            }
        }
        else
        {
            dedupKey = log.EventDateTime.ToString();
            if (!String.IsNullOrEmpty(log.MUID))
            {
                dedupKey += "-" + log.MUID;
            }
        }

        vSchema.DedupKey = dedupKey;
        res.Add(UETLogView.Serialize(vSchema));
    }

    return res;
}
/// <summary>
/// Entry point of the visitization job: loads the UET, UIC and UMS inputs, de-duplicates
/// and joins them, groups visits per user, computes conversions and writes the new
/// escrow candidates.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Input and output locations are hard-coded for one specific run.
    string inputRoot = @"hdfs:///common/vistizationData/";
    string outputRoot = @"hdfs:///user/t-zhuxia/vistizationRes/";
    string uetLogPath = inputRoot + "gat_20160902_0600.csv";
    string uicLogPath = inputRoot + "uic_20160902_0600.csv";
    string anidPath = inputRoot + "ANID_20160831.csv";
    string muidPath = inputRoot + "MUID_20160831.csv";
    string appInstallOutputPath = outputRoot + "Visitization_AppInstall_20160902_00";
    string newEscrowPath = outputRoot + "NewEscrowCandidates_20160902";

    SparkConf sparkConf = (new SparkConf()).SetAppName("VisitizationStreaming");
    SparkContext sc = new SparkContext(sparkConf);

    // Shared reducer: joins two non-empty values with the delimiter, otherwise keeps
    // whichever side is non-empty (null when both are empty).
    Func<string, string, string> joinNonEmpty = (left, right) =>
    {
        bool hasLeft = !string.IsNullOrEmpty(left);
        bool hasRight = !string.IsNullOrEmpty(right);
        if (hasLeft && hasRight)
        {
            return left + delimeter + right;
        }

        if (hasLeft)
        {
            return left;
        }

        return hasRight ? right : null;
    };

    // ----- Load the UET logs and de-duplicate them.
    RDD<string> rawUetLogs = getDataFromFile(sc, uetLogPath);
    var uetLogs = getUETLogs(rawUetLogs);

    // Empty lines are keyed with (null, null); everything else is keyed on the dedup fields.
    var keyedUetLogs = uetLogs.Map(line =>
    {
        if (string.IsNullOrEmpty(line))
        {
            return new KeyValuePair<string, string>(null, null);
        }

        UETLogView view = UETLogView.Deserialize(line);
        string key = view.DedupKey + "," + view.ANID + "," + view.IsNewMUID + "," + view.UAIPId + ","
            + view.ReferrerURL + "," + view.QueryString + "," + view.AnalyticsGuid;
        return new KeyValuePair<string, string>(key, line);
    });

    uetLogs = keyedUetLogs
        .ReduceByKey(joinNonEmpty)
        .Map<string>(UETLogDedupReducer.INSTANCE.GetData)
        .Filter(line => !string.IsNullOrEmpty(line));

    // TODO marker carried over from the original source: "to do after this".

    // ----- Split into page visits (no app-install click id) and app installs.
    var pageVisitLogs = uetLogs.Filter(
        line => string.IsNullOrEmpty(UETLogView.Deserialize(line).AppInstallClickId));
    Console.Out.WriteLine("----------------uetLogs_PageVisitCount: " + pageVisitLogs.Count());

    var appInstallLogs = uetLogs.Filter(
        line => !string.IsNullOrEmpty(UETLogView.Deserialize(line).AppInstallClickId));
    RDD<string> appInstallVisits = appInstallLogs.Map<string>(AppInstallProcessor.INSTANCE.GetData);
    Console.Out.WriteLine("----------------appInstallVisitsCount: " + appInstallVisits.Count());

    // Saving the app-install visits is currently disabled:
    // appInstallVisits.Repartition(1).SaveAsTextFile(appInstallOutputPath);

    // ----- Get the UIC log.
    var rawUicLogs = getDataFromFile(sc, uicLogPath);
    var userIdCoverage = getUICData(rawUicLogs);

    // ----- Join the UET page visits with the UIC log on UETMatchingGuid.
    var uetByMatchingGuid = pageVisitLogs.Map(
        line => new KeyValuePair<Guid?, string>(UETLogView.Deserialize(line).UETMatchingGuid, line));

    var uicByMatchingGuid = userIdCoverage.Map(line =>
    {
        var coverage = UserIdCoverageShcema.Deserialize(line);
        return new KeyValuePair<Guid?, Guid?>(coverage.UETMatchingGuid, coverage.AnalyticsGuid);
    });

    var pageVisitEntries = uetByMatchingGuid.LeftOuterJoin(uicByMatchingGuid).Map(joined =>
    {
        var view = UETLogView.Deserialize(joined.Value.Item1);

        // Only rows with a UIC match are touched: a non-null analytics guid from UIC
        // replaces the one on the view, and DedupKey / QueryString are cleared.
        if (joined.Value.Item2.IsDefined)
        {
            var matchedGuid = joined.Value.Item2.GetValue();
            if (matchedGuid != null)
            {
                view.AnalyticsGuid = matchedGuid;
            }

            view.DedupKey = null;
            view.QueryString = null;
        }

        return UETLogView.Serialize(view);
    });

    // ----- Group page visits per (UAIPId, TagId) and turn each group into visits.
    var groupedPageVisits = pageVisitEntries
        .Map(line =>
        {
            var view = UETLogView.Deserialize(line);
            var groupKey = view.UAIPId.ToString() + "," + view.TagId.ToString();
            return new KeyValuePair<string, string>(groupKey, line);
        })
        .ReduceByKey((left, right) => left + delimeter + right);

    var visitsForUsers = groupedPageVisits.FlatMap<string>(
        grouped => VisitizeReducer.INSTANCE.GetData(grouped));

    // Step 7: The first field to fill is UserIdType, building the general "UETUserId"; by
    //         default it is UAIPID during the construction of SAEventConversionFacts.
    // Step 7.1: Build the TypeOfUser field. TypeOfUser is decided as follows:
    //   1. If MUID is not NULL and IsNewMUID is false, UserIdType is MUID (TypeOfUser 2);
    //      later it will be joined with the UMS MUID view.
    //   2. If MUID is NULL but ANID is not, UserIdType is ANID (TypeOfUser 1); later it will
    //      be joined with the UMS ANID view.
    //   3. If both MUID and ANID are NULL but AnalyticsGuid is not NULL, UserIdType is
    //      AnalyticsGuid (TypeOfUser 3).
    //   4. If AnalyticsGuid is also NULL, UserIdType is Unknown (TypeOfUser -1).
    var typedVisits = getVisitsForUsersWithTypeOfUser(visitsForUsers);

    // Step 7.2: Take the ANID and MUID sub-tables out of the typed visits, because their
    //           ANID/MUID needs to be updated to the StableIdValue according to the UMS mapping.
    var anidVisits = typedVisits.Filter(
        line => VisitsForUser_WithTypeOfUser.Deserialize(line).TypeOfUser == 1);
    var muidVisits = typedVisits.Filter(
        line => VisitsForUser_WithTypeOfUser.Deserialize(line).TypeOfUser == 2);

    // Step 7.3: Build the UMS ANID/MUID view from
    //           "/shares/adCenter.BICore.SubjectArea/SubjectArea/Conversion/UMS/ANID_{yyyyMMdd}.ss(12.43GB)/MUID_{yyyyMMdd}.ss(166.66GB)".
    var rawUmsAnid = getDataFromFile(sc, anidPath);
    var rawUmsMuid = getDataFromFile(sc, muidPath);

    // Step 7.4: Join the ANID (MUID) visits with the UMS ANID (MUID) mapping file to get
    //           the StableIdValue.
    var anidGuids = anidVisits.Map(line => VisitsForUser_WithTypeOfUser.Deserialize(line).ANID);
    Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdANIDGuid: " + anidGuids.Count());

    var muidGuids = muidVisits.Map(line => VisitsForUser_WithTypeOfUser.Deserialize(line).MUID);
    Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdMUIDGuid: " + muidGuids.Count());

    // The UMS rows are keyed by id and passed through BroadcastJoinWrapper.Filter together
    // with the ids collected above.
    var umsAnidLookup = getUMS_ANIDData(rawUmsAnid)
        .Map<KeyValuePair<Guid?, SerializaType>>(line =>
        {
            var mapping = line.DeserializeObject<UMS_ANID>();
            return new KeyValuePair<Guid?, SerializaType>(mapping.ANID, line);
        })
        .FlatMap<KeyValuePair<Guid?, SerializaType>>(new BroadcastJoinWrapper(anidGuids, sc).Filter);

    var umsMuidLookup = getUMS_MUIDData(rawUmsMuid)
        .Map<KeyValuePair<Guid?, SerializaType>>(line =>
        {
            var mapping = line.DeserializeObject<UMS_MUID>();
            return new KeyValuePair<Guid?, SerializaType>(mapping.MUID, line);
        })
        .FlatMap<KeyValuePair<Guid?, SerializaType>>(new BroadcastJoinWrapper(muidGuids, sc).Filter);

    // NOTE(review): in both joins below the matched branch assigns the mapping's ANID/MUID,
    // which is also the join key, so both branches appear to produce the same value.
    // Confirm whether a stable-id field of the UMS row was intended instead.
    var stableAnidVisits = anidVisits
        .Map(line =>
        {
            VisitsForUser_WithTypeOfUser visit = VisitsForUser_WithTypeOfUser.Deserialize(line);
            return new KeyValuePair<Guid?, SerializaType>(visit.ANID, line);
        })
        .LeftOuterJoin(umsAnidLookup)
        .Map(joined =>
        {
            VisitsForUser_WithTypeOfUser visit = VisitsForUser_WithTypeOfUser.Deserialize(joined.Value.Item1);
            var stable = new VisitsForUsersWithStableIdFromID
            {
                UAIPId = visit.UAIPId,
                TagId = visit.TagId,
                TagName = visit.TagName,
                AnalyticsGuid = visit.AnalyticsGuid,
                SAEventConversionFactsRow = visit.SAEventConversionFactsRow
            };

            if (joined.Value.Item2.IsDefined)
            {
                var mapping = joined.Value.Item2.GetValue().DeserializeObject<UMS_ANID>();
                stable.StableId = mapping.ANID;
            }
            else
            {
                stable.StableId = visit.ANID;
            }

            return stable.SerializeObject();
        });

    var stableMuidVisits = muidVisits
        .Map(line =>
        {
            VisitsForUser_WithTypeOfUser visit = VisitsForUser_WithTypeOfUser.Deserialize(line);
            return new KeyValuePair<Guid?, SerializaType>(visit.MUID, line);
        })
        .LeftOuterJoin(umsMuidLookup)
        .Map(joined =>
        {
            VisitsForUser_WithTypeOfUser visit = VisitsForUser_WithTypeOfUser.Deserialize(joined.Value.Item1);
            var stable = new VisitsForUsersWithStableIdFromID
            {
                UAIPId = visit.UAIPId,
                TagId = visit.TagId,
                TagName = visit.TagName,
                AnalyticsGuid = visit.AnalyticsGuid,
                SAEventConversionFactsRow = visit.SAEventConversionFactsRow
            };

            if (joined.Value.Item2.IsDefined)
            {
                var mapping = joined.Value.Item2.GetValue().DeserializeObject<UMS_MUID>();
                stable.StableId = mapping.MUID;
            }
            else
            {
                stable.StableId = visit.MUID;
            }

            return stable.SerializeObject();
        });

    Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromANID: " + stableAnidVisits.Count());
    Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromMUID: " + stableMuidVisits.Count());

    // Step 7.5: Select the UETUserId from the StableId and add the UserType according to
    //           whether it came from ANID or MUID.
    var anidUserVisits = stableAnidVisits.Map(line =>
    {
        var stable = line.DeserializeObject<VisitsForUsersWithStableIdFromID>();
        var user = new VisitsForUsersWithUETUserIdMUIDANIDPart
        {
            UETUserId = stable.StableId,
            TypeOfUser = UserType.A,
            UAIPId = stable.UAIPId,
            TagId = stable.TagId,
            TagName = stable.TagName,
            AnalyticsGuid = stable.AnalyticsGuid,
            SAEventConversionFactsRow = stable.SAEventConversionFactsRow
        };
        return user.SerializeObject();
    });

    var muidUserVisits = stableMuidVisits.Map(line =>
    {
        var stable = line.DeserializeObject<VisitsForUsersWithStableIdFromID>();
        var user = new VisitsForUsersWithUETUserIdMUIDANIDPart
        {
            UETUserId = stable.StableId,
            TypeOfUser = UserType.M,
            UAIPId = stable.UAIPId,
            TagId = stable.TagId,
            TagName = stable.TagName,
            AnalyticsGuid = stable.AnalyticsGuid,
            SAEventConversionFactsRow = stable.SAEventConversionFactsRow
        };
        return user.SerializeObject();
    });

    var muidAndAnidUserVisits = muidUserVisits.Union(anidUserVisits);

    // Step 7.6: For the AnalyticsGuid sub-table (TypeOfUser 3) use AnalyticsGuid as the
    //           UETUserId with UserType AG; for the unknown sub-table (TypeOfUser -1) use
    //           UAIPId with UserType UA.
    var analyticsAndUnknownUserVisits = typedVisits
        .Filter(line =>
        {
            var visit = VisitsForUser_WithTypeOfUser.Deserialize(line);
            return visit.TypeOfUser == 3 || visit.TypeOfUser == -1;
        })
        .Map(line =>
        {
            var visit = VisitsForUser_WithTypeOfUser.Deserialize(line);
            var user = new VisitsForUsersWithUETUserIdMUIDANIDPart
            {
                UAIPId = visit.UAIPId,
                TagId = visit.TagId,
                TagName = visit.TagName,
                AnalyticsGuid = visit.AnalyticsGuid,
                SAEventConversionFactsRow = visit.SAEventConversionFactsRow
            };

            if (visit.TypeOfUser == 3)
            {
                user.UETUserId = visit.AnalyticsGuid;
                user.TypeOfUser = UserType.AG;
            }
            else
            {
                user.UETUserId = visit.UAIPId;
                user.TypeOfUser = UserType.UA;
            }

            return user.SerializeObject();
        });

    // Step 7.7: Union the results of 7.5 and 7.6.
    var visitsWithUetUserId = muidAndAnidUserVisits.Union(analyticsAndUnknownUserVisits);

    // Step 7.8: Reduce on UETUserId, UAIPId, TagId using UserCombineReducer.
    // Each row is first stamped with the EventDateTime of its first visit's first event.
    visitsWithUetUserId = visitsWithUetUserId.Map(line =>
    {
        var user = line.DeserializeObject<VisitsForUsersWithUETUserIdMUIDANIDPart>();
        var firstEventTime = user.SAEventConversionFactsRow.Visits[0].Events[0].EventDateTime;
        return new VisitsForUsersWithUETUserId(user, firstEventTime).SerializeObject();
    });

    // NOTE(review): the rows are sorted by EventDateTime and then reduced by key; whether
    // the sort order is preserved through ReduceByKey is not visible here - confirm.
    var currentVisits = visitsWithUetUserId
        .Map(line =>
        {
            var stamped = line.DeserializeObject<VisitsForUsersWithUETUserId>();
            return new KeyValuePair<long, string>(stamped.EventDateTime, line);
        })
        .SortByKey()
        .Map(timed =>
        {
            var stamped = timed.Value.DeserializeObject<VisitsForUsersWithUETUserId>();
            var userKey = string.Format("{0},{1},{2}", stamped.UETUserId, stamped.UAIPId, stamped.TagId);
            return new KeyValuePair<string, string>(userKey, timed.Value);
        })
        .ReduceByKey(joinNonEmpty)
        .Map<SerializaType>(UserCombineReducer.INSTANCE.getData);

    // Step 8: Handle the current hour's result with escrow visits from the previous hour.
    //         Skipped, because the escrow file does not exist.

    // Step 9: Calculate conversions for each visit using GoalConversionProcessor.
    var visitsWithConversions = currentVisits.MapPartitions(GoalConversionProcessor.INSTANCE.getData);

    // Step 10: Update the escrow file from the visits whose user id type is not UAIPID.
    var nonUaipConversions = visitsWithConversions.Filter(line =>
    {
        var conversion = line.DeserializeObject<VisitsWithConversion>();
        return conversion.SAEventConversionFactsRow.UserIdType != UETUserIdType.UAIPID;
    });
    var newEscrowCandidates = nonUaipConversions.MapPartitions(EscrowCandidateProcessor.INSTANCE.getData);

    // Step 10.2: Output the result to the new escrow file.
    newEscrowCandidates.Repartition(1).SaveAsTextFile(newEscrowPath);
}