clean_data.py revision 50ef334ec9d72f3bfa06be51a493a1ff8f4c3162
1#! /usr/bin/python
2
3"""Cleans output from other scripts to eliminate duplicates.
4
5When frequently sampling data, we see that records occasionally will contain
6the same timestamp (due to perf recording twice in the same second).
7
8This removes all of the duplicate timestamps for every record. Order with
9respect to timestamps is not preserved. Also, the assumption is that the log
10file is a csv with the first value in each row being the time in seconds from a
11standard time.
12
13"""
14
15import argparse
16
# Cleaned records are appended to this file in the current working directory.
OUTPUT_FILENAME = "clean2.csv"


def dedupe_by_timestamp(lines):
    """Collapse csv records that share a timestamp.

    Args:
        lines: iterable of csv text lines whose first comma-separated field
            is an integer timestamp.

    Returns:
        A dict mapping each integer timestamp to the last line seen with that
        timestamp. Every stored line is newline-terminated.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    latest = {}
    for line in lines:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting int("") abort the whole run.
        if not line.strip():
            continue
        timestamp = int(line.split(",", 1)[0])
        # The final line of a file may lack a newline; add one so records
        # never run together once written out.
        if not line.endswith("\n"):
            line += "\n"
        # A later record with the same timestamp replaces the earlier one.
        latest[timestamp] = line
    return latest


def main():
    """Read the csv named on the command line and append its de-duplicated
    records to OUTPUT_FILENAME."""
    parser = argparse.ArgumentParser()
    parser.add_argument("filename")
    args = parser.parse_args()

    with open(args.filename) as log_file:
        latest = dedupe_by_timestamp(log_file)

    # Open the output only after parsing succeeded, and in append mode so that
    # results from earlier runs are kept.
    with open(OUTPUT_FILENAME, "a") as output_file:
        output_file.writelines(latest.values())


if __name__ == "__main__":
    main()
31