Example #1
0
        /*
         * Fetches the next usable line from sr and returns it split into fields,
         * or returns null once sr.ReadLine() reports the end of the input.
         *
         *  sep   - separator characters; when null the line is split with a null
         *          separator and RemoveEmptyEntries (i.e. on whitespace), and
         *          lines that yield no fields at all are skipped
         *  empty - when not null (and sep is given) every zero-length field is
         *          replaced by this string
         *  field - minimum number of fields a line must contain; a shorter line
         *          is reported on stderr and an Exception is thrown
         */
        public static string[] ReadLine(StreamLineNum sr, char[] sep, string empty,
                                        int field)
        {
            string[] parts;
            for (;;)
            {
                string text = sr.ReadLine();
                if (text == null)
                {
                    return null;
                }

                if (sep == null)
                {
                    // no explicit separator: drop empty entries
                    parts = text.Split(sep, StringSplitOptions.RemoveEmptyEntries);
                }
                else
                {
                    parts = text.Split(sep);
                    if (empty != null)
                    {
                        // substitute the placeholder for every empty field
                        for (int k = 0; k < parts.Length; k++)
                        {
                            if (parts[k].Length == 0)
                            {
                                parts[k] = empty;
                            }
                        }
                    }
                }

                if (parts.Length != 0)
                {
                    break;
                }
                // nothing on this line, try the next one
            }

            if (parts.Length >= field)
            {
                return parts;
            }

            string err = "Invalid data in input file " + sr.Fn + ", line " +
                         sr.Line.ToString() + ": too few fields ( expected at least "
                         + field.ToString() + ", found only " + parts.Length + ")!\n";
            Console.Error.WriteLine(err);
            throw new Exception(err);
        }
Example #2
0
        /*
         * Entry point of numjoin.
         *
         * Parses the command line, loads every match file into a hash table
         * keyed by its join field, then streams the main input (file given by -i
         * or standard input) and writes each line to standard output with the
         * fields of the matching lines inserted right after the field they were
         * joined on. Counters are reported on standard error at the end.
         *
         * Options handled here:
         *   -N file [M]  join field N of the input with field M (default 1) of file
         *   -v           only_unmatched: print only the input lines without a match
         *   -m           a missing key is an error (processing stops)
         *   -t c         field separator character
         *   -e str       replacement for empty fields
         *   -H           first line of every file is a header
         *   -u           allow duplicate keys in match files (last one wins)
         *   -c           require the same number of fields on every line
         *   -i file      main input file instead of standard input
         *   -h           print usage
         *
         * Any invalid option aborts the run before any file is opened.
         */
        public static void Main(string[] args)
        {
            string file0 = null;
            Dictionary <int, Tuple <string, int> > matchfiles = new Dictionary <int, Tuple <string, int> >();

            char[] sep   = (char[])null;
            string empty = null;

            bool only_unmatched = false;
            bool skip_missing   = true;
            bool header         = false;
            bool unique         = true;
            bool check_fieldnum = false;
            int  req_fields0    = 1;

            /* process option arguments */
            for (int i = 0; i < args.Length; i++)
            {
                if (!(args[i].Length > 1 && args[i][0] == '-'))
                {
                    continue;     /* non-option arguments are ignored */
                }

                if (char.IsDigit(args[i][1]))
                {
                    /* field to be joined; field numbers are 1-based */
                    int field;
                    if (!Int32.TryParse(args[i].Substring(1), out field) || field < 1 || i + 1 == args.Length)
                    {
                        Console.Error.WriteLine("Invalid parameter: {0}", args[i]);
                        return;
                    }
                    /* the next argument is treated as a filename regardless of its format */
                    string fn        = args[i + 1];
                    int    joinfield = 1;
                    /* an optional third argument not starting with '-' is the join field in the match file */
                    if (i + 2 < args.Length && (args[i + 2].Length == 0 || args[i + 2][0] != '-'))
                    {
                        i += 2;
                        if (!Int32.TryParse(args[i], out joinfield) || joinfield < 1)
                        {
                            Console.Error.WriteLine("Invalid parameter: {0} {1} {2}", args[i - 2], args[i - 1], args[i]);
                            return;
                        }
                    }
                    else
                    {
                        i++;
                    }
                    if (matchfiles.ContainsKey(field))
                    {
                        Console.Error.WriteLine("Invalid parameters: join field {0} appears more than once!", field);
                        return;
                    }
                    if (field > req_fields0)
                    {
                        req_fields0 = field;
                    }
                    matchfiles.Add(field, new Tuple <string, int>(fn, joinfield));
                    continue;
                }

                char opt = args[i][1];
                /* -t, -e and -i consume the following argument as their value */
                if ((opt == 't' || opt == 'e' || opt == 'i') && i + 1 >= args.Length)
                {
                    Console.Error.WriteLine("Missing value for parameter: {0}\n  use numjoin -h for help\n", args[i]);
                    return;
                }

                switch (opt)
                {
                case 'v':
                    only_unmatched = true;
                    break;

                case 'm':
                    skip_missing = false;
                    break;

                case 't':
                    if (args[i + 1].Length == 0)
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} requires a separator character\n  use numjoin -h for help\n", args[i]);
                        return;
                    }
                    sep = new char[1] {
                        args[i + 1][0]
                    };
                    i++;
                    break;

                case 'e':
                    empty = args[i + 1];
                    i++;
                    break;

                case 'H':
                    header = true;
                    break;

                case 'u':
                    unique = false;
                    break;

                case 'c':
                    check_fieldnum = true;
                    break;

                case 'i':
                    file0 = args[i + 1];
                    i++;
                    break;

                case 'h':
                    Console.Write("{0}", usage);
                    return;

                default:
                    Console.Error.WriteLine("Unknown parameter: {0}\n  use numjoin -h for help\n", args[i]);
                    return;
                }
            }

            /* main data structure to hold the hash tables of the files to be joined:
             * (join field in the match file, key -> line, input fields joined on it) */
            List <Tuple <int, Dictionary <string, string[]>, List <int> > > dicts =
                new List <Tuple <int, Dictionary <string, string[]>, List <int> > >();
            /* dict to hold file headers if needed: input field -> (join field in match file, header) */
            Dictionary <int, Tuple <int, string[]> > headers = new Dictionary <int, Tuple <int, string[]> >();
            /* temporary aggregation so that files that potentially appear multiple times are only read once */
            Dictionary <Tuple <string, int>, List <int> > matchfiles2 = new Dictionary <Tuple <string, int>, List <int> >();

            foreach (var x in matchfiles)
            {
                List <int> joined;
                if (matchfiles2.TryGetValue(x.Value, out joined))
                {
                    joined.Add(x.Key);
                }
                else
                {
                    matchfiles2.Add(x.Value, new List <int> {
                        x.Key
                    });
                }
            }
            bool firstline;

            /* read each file, create hashtable from the contents */
            foreach (var x in matchfiles2)
            {
                int field1     = x.Key.Item2;
                int req_fields = field1;
                Dictionary <string, string[]> d = new Dictionary <string, string[]>();

                /* using: the file is closed on every exit path, including errors */
                using (StreamReader sr1 = new StreamReader(x.Key.Item1))
                {
                    StreamLineNum s = new StreamLineNum(sr1, x.Key.Item1);
                    if (header)
                    {
                        string[] header1 = ReadLine(s, sep, empty, req_fields);
                        if (header1 == null)
                        {
                            Console.Error.WriteLine("No data read from file {0}!", x.Key.Item1);
                            return;
                        }
                        foreach (var y in x.Value)
                        {
                            headers.Add(y, new Tuple <int, string[]>(req_fields, header1));
                        }
                    }
                    firstline = true;
                    while (true)
                    {
                        string[] l1 = ReadLine(s, sep, empty, req_fields);
                        if (l1 == null)
                        {
                            break;                                // end of file
                        }
                        if (check_fieldnum)
                        {
                            if (firstline)
                            {
                                /* the first data line fixes the expected field count */
                                req_fields = l1.Length;
                                firstline  = false;
                            }
                            else if (req_fields != l1.Length)
                            {
                                Console.Error.WriteLine("Inconsistent number of fields in file {0} at line {1}!", s.Fn, s.Line);
                                return;
                            }
                        }
                        string key = l1[field1 - 1];                   // note: ReadLine() already checked that l1 has at least field1 fields
                        if (unique && d.ContainsKey(key))
                        {
                            Console.Error.WriteLine("Duplicate key in file 1 ({0}): {1} on line {2}!\n", s.Fn, key, s.Line);
                            return;
                        }
                        /* with -u a later line replaces an earlier one with the same key */
                        d[key] = l1;
                    }
                }

                dicts.Add(new Tuple <int, Dictionary <string, string[]>, List <int> >(field1, d, x.Value));
            }

            char out_sep = '\t';

            if (sep != null)
            {
                out_sep = sep[0];
            }

            UInt64 matched_lines   = 0;
            UInt64 unmatched_lines = 0;

            /* using: output is flushed and the input closed on every exit path,
             * so lines already written are not lost on an early return or exception */
            using (StreamWriter sw = new StreamWriter(Console.OpenStandardOutput()))
            using (StreamReader sr = (file0 != null) ? new StreamReader(file0) : new StreamReader(Console.OpenStandardInput()))
            {
                StreamLineNum s2 = new StreamLineNum(sr, (file0 != null) ? file0 : "<stdin>");

                if (header)
                {
                    /* read and write output header if requested */
                    string[] h2 = ReadLine(s2, sep, empty, req_fields0);
                    if (h2 != null)
                    {
                        /* same layout as the data lines below: every input column is
                         * followed by the columns joined on it (including the last one) */
                        for (int i = 0; i < h2.Length; i++)
                        {
                            if (i > 0)
                            {
                                sw.Write(out_sep);
                            }
                            sw.Write(h2[i]);
                            Tuple <int, string[]> h1;
                            if (only_unmatched == false && headers.TryGetValue(i + 1, out h1))
                            {
                                int      key2 = h1.Item1;
                                string[] h1s  = h1.Item2;
                                for (int j = 0; j < h1s.Length; j++)
                                {
                                    if (j + 1 != key2)     /* the join column itself is not repeated */
                                    {
                                        sw.Write(out_sep);
                                        sw.Write(h1s[j]);
                                    }
                                }
                            }
                        }
                        sw.Write('\n');
                    }
                }

                firstline = true;
                /* temporary index to use for matched fields */
                Tuple <int, string[]>[] matched = new Tuple <int, string[]> [req_fields0];
                while (true)
                {
                    // read one line from file0, process it
                    string[] line2 = ReadLine(s2, sep, empty, req_fields0);
                    if (line2 == null)
                    {
                        break;
                    }
                    if (check_fieldnum)
                    {
                        if (firstline)
                        {
                            req_fields0 = line2.Length;
                            firstline   = false;
                            matched     = new Tuple <int, string[]> [req_fields0];
                        }
                        else if (req_fields0 != line2.Length)
                        {
                            Console.Error.WriteLine("Inconsistent number of fields in file {0} at line {1}!", s2.Fn, s2.Line);
                            return;
                        }
                    }

                    /* check that all fields to be matched are found */
                    bool matched_all = true;
                    foreach (var x in dicts)
                    {
                        foreach (var y in x.Item3)
                        {
                            string   key = line2[y - 1];                 /* key to use for search */
                            string[] match;
                            if (x.Item2.TryGetValue(key, out match))
                            {
                                matched[y - 1] = new Tuple <int, string[]>(x.Item1, match);
                            }
                            else
                            {
                                matched_all    = false;
                                matched[y - 1] = null;
                                if (skip_missing == false)
                                {
                                    Console.Error.WriteLine("Error: key {0} from line {1}, file {2} not found in " +
                                                            "match file {3}!", key, s2.Line, s2.Fn, matchfiles[y].Item1);
                                }
                                break;
                            }
                        }
                        if (!matched_all)
                        {
                            break;
                        }
                    }

                    if (matched_all)
                    {
                        /* main output of results if all were matched; with -v only the
                         * unmatched lines are wanted, so matched lines are just counted */
                        if (only_unmatched == false)
                        {
                            for (int i = 0; i < line2.Length; i++)
                            {
                                sw.Write(line2[i]);
                                if (i < matched.Length && matched[i] != null)
                                {
                                    int      key2  = matched[i].Item1;
                                    string[] line1 = matched[i].Item2;
                                    for (int j = 0; j < line1.Length; j++)
                                    {
                                        if (j + 1 != key2)
                                        {
                                            sw.Write(out_sep);
                                            sw.Write(line1[j]);
                                        }
                                    }
                                }
                                if (i + 1 < line2.Length)
                                {
                                    sw.Write(out_sep);
                                }
                            }
                            sw.Write('\n');
                        }
                        matched_lines++;
                    }
                    else
                    {
                        if (skip_missing == false)
                        {
                            break;     /* -m: the first missing key stops processing */
                        }
                        if (only_unmatched)
                        {
                            /* print the original line */
                            sw.Write(line2[0]);
                            for (int i = 1; i < line2.Length; i++)
                            {
                                sw.Write(out_sep);
                                sw.Write(line2[i]);
                            }
                            sw.Write('\n');
                        }
                        unmatched_lines++;
                    }
                }             // main loop
            }                 // input closed, output flushed

            Console.Error.WriteLine("Matched lines: {0}", matched_lines);
            if (skip_missing || only_unmatched)
            {
                Console.Error.WriteLine("Unmatched lines: {0}", unmatched_lines);
            }
        }
Example #3
0
        /*
         * Entry point of hashjoin.
         *
         * Usage: hashjoin [options] file1 file2   ("-" means standard input)
         *
         * Reads all of file 1 into a hash table keyed by its join field, then
         * streams file 2 and writes, for every line whose key is found, the
         * selected fields of the matching file 1 line(s) followed by the selected
         * fields of the file 2 line. Counters are reported on standard error.
         *
         * Options handled here:
         *   -1 N / -2 N  join field in file 1 / file 2 (1-based, default 1)
         *   -j N         join field in both files
         *   -t c         field separator character
         *   -e str       replacement for empty fields
         *   -a 1|2       also print unpaired lines from the given file
         *   -v 1|2       print only the unpaired lines from the given file
         *   -o1 / -o2    comma separated list of fields to output from file 1 / 2
         *   -H           first line of both files is a header
         *   -u           allow duplicate keys in file 1
         *   -h           print usage
         *
         * Any invalid option aborts the run before any file is opened.
         */
        public static void Main(string[] args)
        {
            string file1 = null;
            string file2 = null;

            int field1      = 1;
            int field2      = 1;
            int req_fields1 = 1;
            int req_fields2 = 1;

            List <int> outfields1 = null;
            List <int> outfields2 = null;

            char[] sep   = (char[])null;
            string empty = null;

            int  unpaired      = 0;       // if 1 or 2, print unpaired lines from the given file
            bool only_unpaired = false;
            bool header        = false;
            bool unique        = true;

            // process option arguments
            int i = 0;

            for (; i < args.Length; i++)
            {
                if (!(args[i].Length > 1 && args[i][0] == '-'))
                {
                    break;          // non-option argument, means the filenames
                }

                char opt = args[i][1];
                // these options consume the following argument as their value
                if ("12jteav".IndexOf(opt) >= 0 && i + 1 >= args.Length)
                {
                    Console.Error.WriteLine("Missing value for parameter: {0}\n  use hashjoin -h for help\n", args[i]); return;
                }

                switch (opt)
                {
                case '1':
                    if (!Int32.TryParse(args[i + 1], out field1))
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use hashjoin -h for help\n", args[i], args[i + 1]); return;
                    }
                    i++;
                    break;

                case '2':
                    if (!Int32.TryParse(args[i + 1], out field2))
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use hashjoin -h for help\n", args[i], args[i + 1]); return;
                    }
                    i++;
                    break;

                case 'j':
                    if (!Int32.TryParse(args[i + 1], out field1))
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use hashjoin -h for help\n", args[i], args[i + 1]); return;
                    }
                    field2 = field1;
                    i++;
                    break;

                case 't':
                    if (args[i + 1].Length == 0)
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} requires a separator character\n  use hashjoin -h for help\n", args[i]); return;
                    }
                    sep = new char[1] {
                        args[i + 1][0]
                    };
                    i++;
                    break;

                case 'e':
                    empty = args[i + 1];
                    i++;
                    break;

                case 'a':
                    // a non-numeric value leaves unpaired at 0 and is rejected as well
                    if (!Int32.TryParse(args[i + 1], out unpaired) || !(unpaired == 1 || unpaired == 2))
                    {
                        Console.Error.WriteLine("-a parameter has to be either 1 or 2\n  use hashjoin -h for help\n"); return;
                    }
                    i++;
                    break;

                case 'v':
                    if (!Int32.TryParse(args[i + 1], out unpaired) || !(unpaired == 1 || unpaired == 2))
                    {
                        Console.Error.WriteLine("-v parameter has to be either 1 or 2\n  use hashjoin -h for help\n"); return;
                    }
                    i++;
                    only_unpaired = true;
                    break;

                case 'o':
                    if (args[i].Length < 3 || !(args[i][2] == '1' || args[i][2] == '2'))
                    {
                        Console.Error.WriteLine("Invalid parameter: {0}\n  (use -o1 or -o2)\n  use hashjoin -h for help\n", args[i]); return;
                    }
                    {
                        List <int> tmp     = new List <int>();
                        bool       valid   = true;
                        bool       consume = false;     // whether the next argument is the field list
                        int        max     = 0;
                        // it is valid to give zero output columns from one of the files
                        // (e.g. to filter the other file)
                        // in this case it might be necessary to give an empty string
                        // as the argument (i.e. -o1 "")
                        if (i + 1 < args.Length)
                        {
                            string list = args[i + 1];
                            if (list.Length == 0)
                            {
                                consume = true;         // explicit empty list
                            }
                            else if (list[0] != '-')
                            {
                                consume = true;
                                foreach (string s in list.Split(','))
                                {
                                    int x;
                                    if (!Int32.TryParse(s, out x) || x < 1)
                                    {
                                        valid = false; break;
                                    }
                                    tmp.Add(x);
                                    if (x > max)
                                    {
                                        max = x;
                                    }
                                }
                            }
                            // otherwise the next argument is another option (or "-"):
                            // the list is empty and that argument is left to be processed
                        }
                        if (!valid)
                        {
                            Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use hashjoin -h for help\n", args[i], args[i + 1]); return;
                        }
                        if (args[i][2] == '1')
                        {
                            outfields1 = tmp;
                            if (max > req_fields1)
                            {
                                req_fields1 = max;
                            }
                        }
                        else
                        {
                            outfields2 = tmp;
                            if (max > req_fields2)
                            {
                                req_fields2 = max;
                            }
                        }
                        if (consume)
                        {
                            i++;
                        }
                    }
                    break;

                case 'H':
                    header = true;
                    break;

                case 'u':
                    unique = false;
                    break;

                case 'h':
                    Console.Write("{0}", usage);
                    return;

                default:
                    Console.Error.WriteLine("Unknown parameter: {0}\n  use hashjoin -h for help\n", args[i]);
                    return;
                }
            }
            // i now points to the first filename
            if (i + 1 >= args.Length)
            {
                Console.Error.WriteLine("Error: expecting two input filenames\n  use hashjoin -h for help\n"); return;
            }
            file1 = args[i];
            file2 = args[i + 1];
            if (file1 == file2)
            {
                Console.Error.WriteLine("Error: input files have to be different!\n"); return;
            }

            if (field1 < 1 || field2 < 1)
            {
                Console.Error.WriteLine("Error: field numbers have to be >= 1!\n"); return;
            }

            // every line must at least contain its join field
            if (field1 > req_fields1)
            {
                req_fields1 = field1;
            }
            if (field2 > req_fields2)
            {
                req_fields2 = field2;
            }

            char out_sep = '\t';
            if (sep != null)
            {
                out_sep = sep[0];
            }

            UInt64 out_lines = 0;
            UInt64 matched1  = 0;
            UInt64 matched2  = 0;
            UInt64 unmatched = 0;

            // using: output is flushed and both inputs are closed on every exit path;
            // both inputs are opened up front so that a missing file is detected early
            using (StreamWriter sw = new StreamWriter(Console.OpenStandardOutput()))
            using (StreamReader sr1 = (file1 == "-") ? new StreamReader(Console.OpenStandardInput()) : new StreamReader(file1))
            using (StreamReader sr2 = (file2 == "-") ? new StreamReader(Console.OpenStandardInput()) : new StreamReader(file2))
            {
                Dictionary <string, File1Line> dict = new Dictionary <string, File1Line>();

                string[] file1header = null;

                StreamLineNum s1 = new StreamLineNum(sr1, file1);
                StreamLineNum s2 = new StreamLineNum(sr2, file2);

                // read all lines from file 1
                if (header)
                {
                    file1header = ReadLine(s1, sep, empty, req_fields1);
                    if (file1header == null)
                    {
                        Console.Error.WriteLine("Error: file 1 is empty (expected header at least!\n"); return;
                    }
                }

                while (true)
                {
                    string[] l1 = ReadLine(s1, sep, empty, req_fields1);
                    if (l1 == null)
                    {
                        break;                        // end of file
                    }
                    string    key = l1[field1 - 1];   // note: ReadLine() already checked that l1 has at least field1 fields
                    File1Line known;
                    if (dict.TryGetValue(key, out known))
                    {
                        if (unique)
                        {
                            Console.Error.WriteLine("Duplicate key in file 1 ({0}): {1} on line {2}!\n", file1, key, s1.Line); return;
                        }
                        known.lines.Add(l1);
                    }
                    else
                    {
                        File1Line l2 = new File1Line();
                        l2.lines.Add(l1);
                        dict.Add(key, l2);
                    }
                }
                sr1.Close();     // file 1 is fully loaded, release it before streaming file 2

                if (header)
                {
                    // read and write output header
                    string[] h2 = ReadLine(s2, sep, empty, req_fields2);
                    string[] h1 = file1header;
                    if (h1 != null && h2 != null)
                    {
                        bool firstout = true;
                        WriteFields(sw, h1, outfields1, ref firstout, out_sep);
                        WriteFields(sw, h2, outfields2, ref firstout, out_sep);
                        sw.Write('\n');
                    }
                }

                while (true)
                {
                    // read one line from file 2, process it
                    string[] line2 = ReadLine(s2, sep, empty, req_fields2);
                    if (line2 == null)
                    {
                        break;
                    }
                    string    key = line2[field2 - 1];
                    File1Line match;
                    if (dict.TryGetValue(key, out match))
                    {
                        if (!only_unpaired)
                        {
                            if (!match.seen)
                            {
                                // count the lines of file 1 only the first time their key is matched
                                matched1 += (UInt64)match.lines.Count;
                            }
                            foreach (string[] line1 in match.lines)
                            {
                                bool firstout = true;

                                // fields from the first file, then fields from the second file
                                WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                                WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                                sw.Write('\n');
                                out_lines++;
                            }
                        }
                        match.seen = true;
                        matched2++;
                    }
                    else if (unpaired == 2)
                    {
                        // still print unpaired lines from file 2
                        bool firstout = true;
                        // note: we write empty fields for file 1
                        if (outfields1 != null)
                        {
                            WriteFields(sw, null, outfields1, ref firstout, out_sep);
                        }
                        WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                        sw.Write('\n');
                        out_lines++;
                        unmatched++;
                    }
                }             // main loop

                sr2.Close();

                // write out unmatched lines from file 1 if needed
                if (unpaired == 1)
                {
                    foreach (File1Line x in dict.Values)
                    {
                        if (x.seen)
                        {
                            continue;
                        }
                        foreach (string[] line1 in x.lines)
                        {
                            // still print unpaired lines from file 1
                            bool firstout = true;
                            WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                            // note: we write empty fields for file 2
                            if (outfields2 != null)
                            {
                                WriteFields(sw, null, outfields2, ref firstout, out_sep);
                            }
                            sw.Write('\n');
                            out_lines++;
                            unmatched++;
                        }
                    }
                }
            }                 // inputs closed, output flushed

            Console.Error.WriteLine("Matched lines from file 1: {0}", matched1);
            Console.Error.WriteLine("Matched lines from file 2: {0}", matched2);
            if (unmatched > 0)
            {
                switch (unpaired)
                {
                case 1:
                    Console.Error.WriteLine("Unmatched lines from file 1: {0}", unmatched);
                    break;

                case 2:
                    Console.Error.WriteLine("Unmatched lines from file 2: {0}", unmatched);
                    break;
                }
            }
            Console.Error.WriteLine("Total lines output: {0}", out_lines);
        }