/*
 * Reads the next line from the given stream and splits it into fields.
 *
 * sr:    reader to take the line from; sr.Fn and sr.Line are only used to
 *        build the error message
 * sep:   separator characters; if null, the line is split on whitespace and
 *        empty entries are dropped
 * empty: if not null (and sep is given), every zero-length field is replaced
 *        by this string
 * field: the minimum number of fields required in the line
 *
 * Returns the fields of the line, or null on EOF.
 * Lines that produce no fields at all are skipped; String.Split only returns
 * an empty array when empty entries are removed, i.e. when sep is null.
 *
 * Throws System.IO.InvalidDataException (a subclass of Exception) if the line
 * has fewer than `field` fields; the message is also written to stderr.
 */
public static string[] ReadLine(StreamLineNum sr, char[] sep, string empty, int field)
{
    string[] r = null;
    do {
        string line = sr.ReadLine();
        if (line == null) {
            return null; /* EOF */
        }
        if (sep != null) {
            r = line.Split(sep);
            if (empty != null) {
                for (int i = 0; i < r.Length; i++) {
                    if (r[i].Length == 0) {
                        r[i] = empty;
                    }
                }
            }
        } else {
            /* sep is null here: split on whitespace */
            r = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
        }
    } while (r.Length == 0);

    if (r.Length < field) {
        string err = "Invalid data in input file " + sr.Fn + ", line " + sr.Line.ToString() + ": too few fields ( expected at least " + field.ToString() + ", found only " + r.Length + ")!\n";
        Console.Error.WriteLine("{0}", err);
        /* a specific exception type instead of the bare System.Exception */
        throw new System.IO.InvalidDataException(err);
    }
    return r;
}
/*
 * Entry point of numjoin: joins the lines of the main input (the file given
 * with -i, or standard input) with one or more match files on numbered fields.
 *
 * Arguments, as parsed below:
 *   -N file [K]  join field N of the main input with field K (default 1) of file
 *   -v           write lines without a match unchanged; no joined header fields
 *   -m           a key that is not found is reported and stops processing
 *   -t C         field separator (first character of the argument)
 *   -e STR       replacement for empty fields (ReadLine applies it only with -t)
 *   -H           the first line of every file is a header
 *   -u           allow duplicate keys in a match file (the last line wins)
 *   -c           require the same number of fields on every line of a file
 *   -i file      main input file
 *   -h           print the usage text and exit
 * Arguments that do not start with '-' (and are not consumed by an option)
 * are ignored.
 *
 * Statistics are written to stderr after the output has been flushed.
 */
public static void Main(string[] args)
{
    string file0 = null;
    /* join field in the main input -> (match file name, key field in that file) */
    Dictionary<int, Tuple<string, int>> matchfiles = new Dictionary<int, Tuple<string, int>>();
    char[] sep = null;
    string empty = null;
    bool only_unmatched = false;
    bool skip_missing = true;
    bool header = false;
    bool unique = true;
    bool check_fieldnum = false;
    int req_fields0 = 1;

    /* process option arguments */
    for (int i = 0; i < args.Length; i++) {
        if (!(args[i].Length > 1 && args[i][0] == '-')) {
            continue;
        }

        if (char.IsDigit(args[i][1])) {
            /* field to be joined; field numbers are 1-based, so 0 is rejected */
            int field;
            if (!Int32.TryParse(args[i].Substring(1), out field) || field < 1 || i + 1 == args.Length) {
                Console.Error.WriteLine("Invalid parameter: {0}", args[i]);
                return;
            }
            /* the next argument is treated as a filename regardless of its format */
            string fn = args[i + 1];
            int joinfield = 1;
            /* an empty argument cannot be an option, so it is parsed (and rejected) as a field number */
            if (i + 2 < args.Length && (args[i + 2].Length == 0 || args[i + 2][0] != '-')) {
                i += 2;
                if (!Int32.TryParse(args[i], out joinfield) || joinfield < 1) {
                    Console.Error.WriteLine("Invalid parameter: {0} {1} {2}", args[i - 2], args[i - 1], args[i]);
                    return;
                }
            } else {
                i++;
            }
            if (matchfiles.ContainsKey(field)) {
                Console.Error.WriteLine("Invalid parameters: join field {0} appears more than once!", field);
                return;
            }
            if (field > req_fields0) {
                req_fields0 = field;
            }
            matchfiles.Add(field, new Tuple<string, int>(fn, joinfield));
            continue;
        }

        char opt = args[i][1];
        /* options that take a value must not be the last argument */
        if ((opt == 't' || opt == 'e' || opt == 'i') && i + 1 >= args.Length) {
            Console.Error.WriteLine("Missing value for parameter: {0}\n use numjoin -h for help\n", args[i]);
            return;
        }
        switch (opt) {
            case 'v':
                only_unmatched = true;
                break;
            case 'm':
                skip_missing = false;
                break;
            case 't':
                if (args[i + 1].Length == 0) {
                    Console.Error.WriteLine("Invalid parameter: {0} (the separator must not be empty)\n use numjoin -h for help\n", args[i]);
                    return;
                }
                sep = new char[1] { args[i + 1][0] };
                i++;
                break;
            case 'e':
                empty = args[i + 1];
                i++;
                break;
            case 'H':
                header = true;
                break;
            case 'u':
                unique = false;
                break;
            case 'c':
                check_fieldnum = true;
                break;
            case 'i':
                file0 = args[i + 1];
                i++;
                break;
            case 'h':
                Console.Write("{0}", usage);
                return;
            default:
                Console.Error.WriteLine("Unknown parameter: {0}\n use numjoin -h for help\n", args[i]);
                return;
        }
    }

    /* main data structure to hold the hash tables of the files to be joined:
     * (key field in the match file, key -> line, join fields of the main input) */
    List<Tuple<int, Dictionary<string, string[]>, List<int>>> dicts = new List<Tuple<int, Dictionary<string, string[]>, List<int>>>();
    /* dict to hold file headers if needed: join field -> (key field, header fields) */
    Dictionary<int, Tuple<int, string[]>> headers = new Dictionary<int, Tuple<int, string[]>>();

    /* temporary aggregation so that files that potentially appear multiple times are only read once */
    Dictionary<Tuple<string, int>, List<int>> matchfiles2 = new Dictionary<Tuple<string, int>, List<int>>();
    foreach (var x in matchfiles) {
        List<int> joinfields;
        if (!matchfiles2.TryGetValue(x.Value, out joinfields)) {
            joinfields = new List<int>();
            matchfiles2.Add(x.Value, joinfields);
        }
        joinfields.Add(x.Key);
    }

    bool firstline;
    /* read each file, create hashtable from the contents */
    foreach (var x in matchfiles2) {
        int field1 = x.Key.Item2;
        int req_fields = field1;
        Dictionary<string, string[]> d = new Dictionary<string, string[]>();

        /* using: the reader is released on the early-return paths as well */
        using (StreamReader sr1 = new StreamReader(x.Key.Item1)) {
            StreamLineNum s = new StreamLineNum(sr1, x.Key.Item1);
            if (header) {
                string[] header1 = ReadLine(s, sep, empty, req_fields);
                if (header1 == null) {
                    Console.Error.WriteLine("No data read from file {0}!", x.Key.Item1);
                    return;
                }
                foreach (var y in x.Value) {
                    headers.Add(y, new Tuple<int, string[]>(req_fields, header1));
                }
            }
            firstline = true;
            while (true) {
                string[] l1 = ReadLine(s, sep, empty, req_fields);
                if (l1 == null) {
                    break; // end of file
                }
                if (check_fieldnum) {
                    if (firstline) {
                        /* the first data line fixes the field count of the file */
                        req_fields = l1.Length;
                        firstline = false;
                    } else if (req_fields != l1.Length) {
                        Console.Error.WriteLine("Inconsistent number of fields in file {0} at line {1}!", s.Fn, s.Line);
                        return;
                    }
                }
                string key = l1[field1 - 1]; // note: we already checked that l1 has at least field1 fields in ReadLine()
                if (d.ContainsKey(key)) {
                    if (unique) {
                        Console.Error.WriteLine("Duplicate key in file 1 ({0}): {1} on line {2}!\n", s.Fn, key, s.Line);
                        return;
                    }
                    d[key] = l1;
                } else {
                    d.Add(key, l1);
                }
            }
        }
        dicts.Add(new Tuple<int, Dictionary<string, string[]>, List<int>>(field1, d, x.Value));
    }

    UInt64 matched_lines = 0;
    UInt64 unmatched_lines = 0;

    /* using: buffered output is flushed on the early-return path as well */
    using (StreamWriter sw = new StreamWriter(Console.OpenStandardOutput()))
    using (StreamReader sr = (file0 != null) ? new StreamReader(file0) : new StreamReader(Console.OpenStandardInput())) {
        StreamLineNum s2 = new StreamLineNum(sr, (file0 != null) ? file0 : "<stdin>");
        char out_sep = (sep != null) ? sep[0] : '\t';

        if (header) {
            /* read and write output header if requested */
            string[] h2 = ReadLine(s2, sep, empty, req_fields0);
            if (h2 != null) {
                /* same layout as the data rows below: each input field is
                 * followed by the fields joined on it; this also covers a
                 * join on the last input field */
                for (int i = 0; i < h2.Length; i++) {
                    if (i > 0) {
                        sw.Write(out_sep);
                    }
                    sw.Write(h2[i]);
                    Tuple<int, string[]> h1;
                    if (only_unmatched == false && headers.TryGetValue(i + 1, out h1)) {
                        int key2 = h1.Item1;
                        string[] h1s = h1.Item2;
                        for (int j = 0; j < h1s.Length; j++) {
                            if (j + 1 != key2) { /* the key field itself is not repeated */
                                sw.Write(out_sep);
                                sw.Write(h1s[j]);
                            }
                        }
                    }
                }
                sw.Write('\n');
            }
        }

        firstline = true;
        /* temporary index to use for matched fields (indexed by join field - 1) */
        Tuple<int, string[]>[] matched = new Tuple<int, string[]>[req_fields0];
        while (true) {
            // read one line from file0, process it
            string[] line2 = ReadLine(s2, sep, empty, req_fields0);
            if (line2 == null) {
                break;
            }
            if (check_fieldnum) {
                if (firstline) {
                    req_fields0 = line2.Length;
                    firstline = false;
                    matched = new Tuple<int, string[]>[req_fields0];
                } else if (req_fields0 != line2.Length) {
                    Console.Error.WriteLine("Inconsistent number of fields in file {0} at line {1}!", s2.Fn, s2.Line);
                    return;
                }
            }

            /* check that all fields to be matched are found */
            bool matched_all = true;
            foreach (var x in dicts) {
                foreach (var y in x.Item3) {
                    string key = line2[y - 1]; /* key to use for search */
                    string[] match;
                    if (x.Item2.TryGetValue(key, out match)) {
                        matched[y - 1] = new Tuple<int, string[]>(x.Item1, match);
                    } else {
                        matched_all = false;
                        matched[y - 1] = null;
                        if (skip_missing == false) {
                            Console.Error.WriteLine("Error: key {0} from line {1}, file {2} not found in " + "match file {3}!", key, s2.Line, s2.Fn, matchfiles[y].Item1);
                        }
                        break;
                    }
                }
                if (!matched_all) {
                    break;
                }
            }

            /* main output of results if all were matched */
            /* NOTE(review): with -v matched lines are still written in joined
             * form although the header omits the joined columns - confirm
             * against the usage text whether they should be suppressed */
            if (matched_all) {
                for (int i = 0; i < line2.Length; i++) {
                    sw.Write(line2[i]);
                    if (i < matched.Length && matched[i] != null) {
                        int key2 = matched[i].Item1;
                        string[] line1 = matched[i].Item2;
                        for (int j = 0; j < line1.Length; j++) {
                            if (j + 1 != key2) {
                                sw.Write(out_sep);
                                sw.Write(line1[j]);
                            }
                        }
                    }
                    if (i + 1 < line2.Length) {
                        sw.Write(out_sep);
                    }
                }
                sw.Write('\n');
                matched_lines++;
            } else {
                if (skip_missing == false) {
                    break; /* the missing key was already reported above */
                }
                if (only_unmatched) {
                    /* print the original line */
                    sw.Write(line2[0]);
                    for (int i = 1; i < line2.Length; i++) {
                        sw.Write(out_sep);
                        sw.Write(line2[i]);
                    }
                    sw.Write('\n');
                }
                unmatched_lines++;
            }
        } // main loop
    } // input closed, output flushed

    Console.Error.WriteLine("Matched lines: {0}", matched_lines);
    if (skip_missing || only_unmatched) {
        Console.Error.WriteLine("Unmatched lines: {0}", unmatched_lines);
    }
}
// Entry point of hashjoin: loads file 1 into a hash table keyed by its join
// field, then streams file 2 and writes the joined lines to standard output.
//
// Arguments, as parsed below (options first, then the two filenames):
//   -1 N       join field in file 1 (default 1)
//   -2 N       join field in file 2 (default 1)
//   -j N       join field in both files
//   -t C       field separator (first character of the argument)
//   -e STR     replacement for empty fields (ReadLine applies it only with -t)
//   -a 1|2     also print unpaired lines from the given file
//   -v 1|2     print only the unpaired lines from the given file
//   -o1 LIST   comma-separated fields to output from file 1 (same for -o2);
//              the list may be omitted or given as "" for no fields
//   -H         the first line of both files is a header
//   -u         allow duplicate keys in file 1 (all lines are kept)
//   -h         print the usage text and exit
//   file1 file2   input files, "-" means standard input
//
// Statistics are written to stderr after the output has been flushed.
public static void Main(string[] args)
{
    string file1 = null;
    string file2 = null;
    int field1 = 1;
    int field2 = 1;
    int req_fields1 = 1;
    int req_fields2 = 1;
    List<int> outfields1 = null;
    List<int> outfields2 = null;
    char[] sep = null;
    string empty = null;
    int unpaired = 0; // if 1 or 2, print unpaired lines from the given file
    bool only_unpaired = false;
    bool header = false;
    bool unique = true;

    // process option arguments
    int i = 0;
    for (; i < args.Length; i++) {
        if (!(args[i].Length > 1 && args[i][0] == '-')) {
            break; // non-option argument, means the filenames
        }

        char opt = args[i][1];
        bool has_next = i + 1 < args.Length;
        // these options take a value, so they must not be the last argument
        if (!has_next && "12jteav".IndexOf(opt) >= 0) {
            Console.Error.WriteLine("Missing value for parameter: {0}\n use hashjoin -h for help\n", args[i]);
            return;
        }

        switch (opt) {
            case '1':
                if (!Int32.TryParse(args[i + 1], out field1)) {
                    Console.Error.WriteLine("Invalid parameter: {0} {1}\n use hashjoin -h for help\n", args[i], args[i + 1]);
                    return;
                }
                i++;
                break;
            case '2':
                if (!Int32.TryParse(args[i + 1], out field2)) {
                    Console.Error.WriteLine("Invalid parameter: {0} {1}\n use hashjoin -h for help\n", args[i], args[i + 1]);
                    return;
                }
                i++;
                break;
            case 'j':
                if (!Int32.TryParse(args[i + 1], out field1)) {
                    Console.Error.WriteLine("Invalid parameter: {0} {1}\n use hashjoin -h for help\n", args[i], args[i + 1]);
                    return;
                }
                field2 = field1;
                i++;
                break;
            case 't':
                if (args[i + 1].Length == 0) {
                    Console.Error.WriteLine("Invalid parameter: {0} (the separator must not be empty)\n use hashjoin -h for help\n", args[i]);
                    return;
                }
                sep = new char[1] { args[i + 1][0] };
                i++;
                break;
            case 'e':
                empty = args[i + 1];
                i++;
                break;
            case 'a':
                if (!Int32.TryParse(args[i + 1], out unpaired) || !(unpaired == 1 || unpaired == 2)) {
                    Console.Error.WriteLine("-a parameter has to be either 1 or 2\n use hashjoin -h for help\n");
                    return;
                }
                i++;
                break;
            case 'v':
                if (!Int32.TryParse(args[i + 1], out unpaired) || !(unpaired == 1 || unpaired == 2)) {
                    Console.Error.WriteLine("-v parameter has to be either 1 or 2\n use hashjoin -h for help\n");
                    return;
                }
                i++;
                only_unpaired = true;
                break;
            case 'o':
                if (args[i].Length < 3 || !(args[i][2] == '1' || args[i][2] == '2')) {
                    Console.Error.WriteLine("Invalid parameter: {0}\n (use -o1 or -o2)\n use hashjoin -h for help\n", args[i]);
                    return;
                }
                {
                    List<int> columns = new List<int>();
                    int max = 0;
                    // it is valid to give zero output columns from one of the files
                    // (e.g. to filter the other file); if the option is directly
                    // followed by the filenames, an empty string has to be given
                    // as the argument (i.e. -o1 "")
                    bool has_list = has_next && args[i + 1].Length > 0 && args[i + 1][0] != '-';
                    if (has_list) {
                        foreach (string item in args[i + 1].Split(',')) {
                            int col;
                            if (!Int32.TryParse(item, out col) || col < 1) {
                                Console.Error.WriteLine("Invalid parameter: {0} {1}\n use hashjoin -h for help\n", args[i], args[i + 1]);
                                return;
                            }
                            columns.Add(col);
                            if (col > max) {
                                max = col;
                            }
                        }
                    }
                    if (args[i][2] == '1') {
                        outfields1 = columns;
                        if (max > req_fields1) {
                            req_fields1 = max;
                        }
                    } else {
                        outfields2 = columns;
                        if (max > req_fields2) {
                            req_fields2 = max;
                        }
                    }
                    // only consume the next argument if it belonged to this
                    // option (a list or an explicit ""); an argument starting
                    // with '-' is left to be processed as the next option
                    if (has_list || (has_next && args[i + 1].Length == 0)) {
                        i++;
                    }
                }
                break;
            case 'H':
                header = true;
                break;
            case 'u':
                unique = false;
                break;
            case 'h':
                Console.Write("{0}", usage);
                return;
            default:
                Console.Error.WriteLine("Unknown parameter: {0}\n use hashjoin -h for help\n", args[i]);
                return;
        }
    }

    // i now points to the first filename
    if (i + 1 >= args.Length) {
        Console.Error.WriteLine("Error: expecting two input filenames\n use hashjoin -h for help\n");
        return;
    }
    file1 = args[i];
    file2 = args[i + 1];
    if (file1 == file2) {
        Console.Error.WriteLine("Error: input files have to be different!\n");
        return;
    }
    if (field1 < 1 || field2 < 1) {
        Console.Error.WriteLine("Error: field numbers have to be >= 1!\n");
        return;
    }
    // every line must at least contain the join field
    if (field1 > req_fields1) {
        req_fields1 = field1;
    }
    if (field2 > req_fields2) {
        req_fields2 = field2;
    }

    UInt64 out_lines = 0;
    UInt64 matched1 = 0;
    UInt64 matched2 = 0;
    UInt64 unmatched = 0;

    // using: streams are released and the output is flushed on every exit path
    using (StreamWriter sw = new StreamWriter(Console.OpenStandardOutput()))
    using (StreamReader sr1 = (file1 == "-") ? new StreamReader(Console.OpenStandardInput()) : new StreamReader(file1))
    using (StreamReader sr2 = (file2 == "-") ? new StreamReader(Console.OpenStandardInput()) : new StreamReader(file2)) {
        Dictionary<string, File1Line> dict = new Dictionary<string, File1Line>();
        string[] file1header = null;
        StreamLineNum s1 = new StreamLineNum(sr1, file1);
        StreamLineNum s2 = new StreamLineNum(sr2, file2);
        char out_sep = (sep != null) ? sep[0] : '\t';

        // read all lines from file 1
        if (header) {
            file1header = ReadLine(s1, sep, empty, req_fields1);
            if (file1header == null) {
                Console.Error.WriteLine("Error: file 1 is empty (expected header at least!\n");
                return;
            }
        }
        while (true) {
            string[] l1 = ReadLine(s1, sep, empty, req_fields1);
            if (l1 == null) {
                break; // end of file
            }
            string key = l1[field1 - 1]; // note: we already checked that l1 has at least field1 fields in ReadLine()
            File1Line entry;
            if (dict.TryGetValue(key, out entry)) {
                if (unique) {
                    Console.Error.WriteLine("Duplicate key in file 1 ({0}): {1} on line {2}!\n", file1, key, s1.Line);
                    return;
                }
                entry.lines.Add(l1);
            } else {
                entry = new File1Line();
                entry.lines.Add(l1);
                dict.Add(key, entry);
            }
        }
        sr1.Close(); // file 1 is not needed any more

        if (header) {
            // read and write output header
            string[] h2 = ReadLine(s2, sep, empty, req_fields2);
            string[] h1 = file1header;
            if (h1 != null && h2 != null) {
                bool firstout = true;
                WriteFields(sw, h1, outfields1, ref firstout, out_sep);
                WriteFields(sw, h2, outfields2, ref firstout, out_sep);
                sw.Write('\n');
            }
        }

        while (true) {
            // read one line from file 2, process it
            string[] line2 = ReadLine(s2, sep, empty, req_fields2);
            if (line2 == null) {
                break;
            }
            string key = line2[field2 - 1];
            File1Line match;
            if (dict.TryGetValue(key, out match)) {
                if (!only_unpaired) {
                    if (!match.seen) {
                        // count the lines of file 1 only on their first match
                        matched1 += (UInt64)match.lines.Count;
                    }
                    foreach (string[] line1 in match.lines) {
                        bool firstout = true;
                        // write out fields from the first file, then the second
                        WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                        WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                        sw.Write('\n');
                        out_lines++;
                    }
                }
                match.seen = true;
                matched2++;
            } else if (unpaired == 2) {
                // still print unpaired lines from file 2
                bool firstout = true;
                // note: we write empty fields for file 1
                if (outfields1 != null) {
                    WriteFields(sw, null, outfields1, ref firstout, out_sep);
                }
                WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                sw.Write('\n');
                out_lines++;
                unmatched++;
            }
        } // main loop
        sr2.Close();

        // write out unmatched lines from file 1 if needed
        if (unpaired == 1) {
            foreach (File1Line x in dict.Values) {
                if (x.seen == false) {
                    foreach (string[] line1 in x.lines) {
                        // still print unpaired lines from file 1
                        bool firstout = true;
                        WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                        // note: we write empty fields for file 2
                        if (outfields2 != null) {
                            WriteFields(sw, null, outfields2, ref firstout, out_sep);
                        }
                        sw.Write('\n');
                        out_lines++;
                        unmatched++;
                    }
                }
            }
        }
    } // output flushed

    Console.Error.WriteLine("Matched lines from file 1: {0}", matched1);
    Console.Error.WriteLine("Matched lines from file 2: {0}", matched2);
    if (unmatched > 0) {
        switch (unpaired) {
            case 1:
                Console.Error.WriteLine("Unmatched lines from file 1: {0}", unmatched);
                break;
            case 2:
                Console.Error.WriteLine("Unmatched lines from file 2: {0}", unmatched);
                break;
        }
    }
    Console.Error.WriteLine("Total lines output: {0}", out_lines);
}