forked from Crunch-io/crunchdb-kata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.py
executable file
·135 lines (115 loc) · 4.46 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
"""Answer queries."""
import collections
import gzip
import json
import os
import pathlib
# import pprint
import sys
# import pudb
import unidecode
import constants
def ascii_encode_dict(data):
    """
    Return a copy of 'data' with its string keys and values made ASCII-only.

    Meant to be used as a json.loads() object_hook, so that decoded documents
    can be printed to the console during debugging.  The transliteration
    itself is delegated to unidecode.unidecode().

    Only strings that sit directly in 'data' are converted.  Anything else
    (numbers, booleans, None, lists, nested objects) is passed through
    unchanged, so strings inside a list value are NOT transliterated.

    Based on https://stackoverflow.com/questions/9590382/forcing-python-json-module-to-work-with-ascii
    """
    def ascii_encode(item):
        # JSON values are not necessarily strings; rejecting them outright
        # would make the hook unusable on any document containing a number,
        # a null, or a nested object.
        if isinstance(item, str):
            return unidecode.unidecode(item)
        return item

    return {ascii_encode(key): ascii_encode(value) for key, value in data.items()}
def get_json_documents(convert=True, directory='json-data', min_batch=50):
    """
    Yield new JSON documents, at least 'min_batch' (default 50) at a time.

    Looks for '*.json.gz' files in 'directory'.  If fewer than 'min_batch'
    files are waiting, nothing is yielded and nothing is deleted.  Otherwise
    every waiting file is read, deleted, decoded as UTF-8 JSON, and yielded as
    a Python object, one document per file.

    If 'convert' is true the documents are passed through ascii_encode_dict(),
    which is nice for debugging; if false the text is left as-is, which is
    nice for preserving foreign-language text.

    We assume a single consumer process: files are deleted as they are read,
    so concurrent consumers would race for the same files.
    """
    json_filenames = list(pathlib.Path(directory).glob('*.json.gz'))

    # Compare the count directly.  A slice test such as json_filenames[50:]
    # is only truthy from the 51st file onwards, which would contradict the
    # "at least 50" promise when exactly 50 files are waiting.
    if len(json_filenames) < min_batch:
        return

    # object_hook=None is json.loads()'s default, i.e. no conversion.
    object_hook = ascii_encode_dict if convert else None

    for json_filename in json_filenames:
        with gzip.open(json_filename, 'rb') as answerfile:
            bytes_data = answerfile.read()
        # Remove the file only after the handle is closed.  Note that it is
        # removed before parsing, so a malformed document is dropped rather
        # than retried on the next run.
        os.unlink(json_filename)
        yield json.loads(bytes_data.decode('UTF-8'), object_hook=object_hook)
def usage(retval):
    """
    Print the list of supported query options, then exit.

    The text is written to stdout when 'retval' is 0 and to stderr for any
    other value.  The process then exits with 'retval' as its status, so this
    function never returns.
    """
    stream = sys.stdout if retval == 0 else sys.stderr
    lines = (
        '{}: Must specify exactly one of:'.format(sys.argv[0]),
        ' --most-frequently-owned-car-brand',
        ' --favorite-car-brand',
        ' --most-frequently-listened-music-artist',
        ' --favourite-music-artist',
    )
    for line in lines:
        print(line, file=stream)
    sys.exit(retval)
def main():
    """
    Get the ball rolling.

    Parses the command line, which must contain exactly one query option (see
    usage()).  Every document produced by get_json_documents() is then scanned
    for keys of the form '<base>.<name>'.  When <base> belongs to the variable
    set chosen by the query and the value is 'yes', <name> is counted.

    The most common name is printed to stdout as a (name, count) tuple.  If
    nothing was counted, a message is written to stderr instead.
    """
    # Command-line option -> internal query name.  The mixed
    # favorite/favourite spellings are the documented interface.
    flag_to_query = {
        '--most-frequently-owned-car-brand': 'owned-car-brand',
        '--favorite-car-brand': 'favourite-car-brand',
        '--most-frequently-listened-music-artist': 'listened-music-artist',
        '--favourite-music-artist': 'favourite-music-artist',
    }

    query = ''
    query_count = 0
    # Iterate over a slice so that sys.argv itself is left intact.
    for arg in sys.argv[1:]:
        if arg in flag_to_query:
            query_count += 1
            query = flag_to_query[arg]
        elif arg in ('-h', '--help'):
            usage(0)
        else:
            print('{}: Unrecognized option: {}'.format(sys.argv[0], arg), file=sys.stderr)
            usage(1)

    if query_count != 1:
        usage(1)

    # The query cannot change while documents are scanned, so pick the set of
    # variable names once instead of re-deciding for every key.
    # NOTE(review): both car queries select constants.mrcarvars, so as written
    # they produce the same answer -- presumably each should look at its own
    # variable; confirm against constants.py.
    if query in ('owned-car-brand', 'favourite-car-brand'):
        wanted_bases = constants.mrcarvars
    elif query == 'listened-music-artist':
        wanted_bases = constants.mrsingervars
    else:
        # query == 'favourite-music-artist'
        # FIXME: Do we have no favourite music artists in our sample data?
        wanted_bases = constants.singervars

    # Base names previously observed in the sample data, with key counts:
    #   listened_singers, known_singers, disliked_singers: 558224 each
    #   liked_cars, owned_cars, ever_owned_cars:           272084 each
    counter = collections.Counter()
    for json_docno, json_document in enumerate(get_json_documents()):
        # Progress indicator: the index of the document being processed.
        print(json_docno, file=sys.stderr)
        for key, value in json_document.items():
            base, dot, name = key.partition('.')
            if not dot:
                # No '.' in the key, so it is not a '<base>.<name>' answer.
                continue
            # A key ending in '.' would have nothing to count.
            assert name != ''
            if base in wanted_bases and value == 'yes':
                counter[name] += 1

    top = counter.most_common(1)
    if top:
        print(top[0])
    else:
        print('No occurences found', file=sys.stderr)
# Run the query only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()