The reason people use SQL is because you can run short queries like this:
Example Query:
$ SELECT country customer order 'sum{{price*quantity}}' --FROM cust_order_d.py --WHERE 'price*quantity>0' --GROUP_BY country,customer,order --HAVING 'sum{{price*quantity}}>0' --ORDER_BY 'sum{{price*quantity}}'And get the answer without writing too much code...
Example INPUT cust_order_d.py with python or simple JASON
[{'country': 'NZ', 'customer': 'Andrew', 'item': 'Apples', 'order': 1, 'price': 5.0, 'quantity': 2.0}, {'country': 'NZ', 'customer': 'Andrew', 'item': 'Bananas', 'order': 1, 'price': 1.0, 'quantity': 5.0}, {'country': 'NZ', 'customer': 'Andrew', 'item': 'Carrots', 'order': 2, 'price': 2.5, 'quantity': 2.0}, {'country': 'NZ', 'customer': 'Brenda', 'item': 'Apples', 'order': 3, 'price': 5.0, 'quantity': 1.0}, {'country': 'NZ', 'customer': 'Brenda', 'item': 'Banana', 'order': 3, 'price': 1.0, 'quantity': 1.0}, {'country': 'NZ', 'customer': 'Brenda', 'item': 'Apples', 'order': 4, 'price': 5.0, 'quantity': 1.0}, {'country': 'NZ', 'customer': 'Brenda', 'item': 'Carrots', 'order': 4, 'price': 2.5, 'quantity': 1.0}, {'country': 'AU', 'customer': 'Carol', 'item': 'Apples', 'order': 5, 'price': 5.0, 'quantity': 3.0}, {'country': 'AU', 'customer': 'Carol', 'item': 'Bananas', 'order': 5, 'price': 1.0, 'quantity': 6.0}, {'country': 'AU', 'customer': 'Carol', 'item': 'Carrots', 'order': 6, 'price': 2.5, 'quantity': 4.0} ]
Example Output:
[[('country', 'NZ'), ('customer', 'Andrew'), ('order', 2), ("sum(agg['price*quantity'])", 5.0)], [('country', 'NZ'), ('customer', 'Brenda'), ('order', 3), ("sum(agg['price*quantity'])", 6.0)], [('country', 'NZ'), ('customer', 'Brenda'), ('order', 4), ("sum(agg['price*quantity'])", 7.5)], [('country', 'AU'), ('customer', 'Carol'), ('order', 6), ("sum(agg['price*quantity'])", 10.0)], [('country', 'NZ'), ('customer', 'Andrew'), ('order', 1), ("sum(agg['price*quantity'])", 15.0)], [('country', 'AU'), ('customer', 'Carol'), ('order', 5), ("sum(agg['price*quantity'])", 21.0)]]
Note that the order sub-total as correctly calculated and sorted on the right hand side... And that didn't even hurt... :-)
This is only Alpha code, but I am happy to release the code under the following license...
Attribution-NonCommercial-NoDerivs 3.0 Australia(CC BY-NC-ND 3.0 AU)
Here is the Code:
#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function end=fi=od=done=esac=yrt=htiw=fed=ssalc=lambda *skip: skip # ¢ replace Bourne's "done" with "od" & add 2¢ import re,time,os,sys,time,string,pprint,pdb pp=pprint.pprint; width=132 debug=False from itertools import * from collections import * def get_group_by_func(*field_l): def out(record_d): return [ record_d[field] if isinstance(field,str) else field(record_d) for field in field_l ] return out def get_order_by_func(*field_l): key_func=get_group_by_func(*field_l) return lambda a,b: cmp(key_func(a), key_func(b)) def is_solo_arg(arg): return isinstance(arg,(str,unicode)) class PathDict(object): def __init__(self,local_d_l=[],global_d={}): self.local_d_l=local_d_l # .copy()??? if global_d: self.local_d_l+=global_d, def __getitem__(self,key): for d in self.local_d_l: try: return d[key] except KeyError as error: pass else: raise #get=__getitem__ re_var_name=re.compile("^[a-z_][a-z0-9_]*$",re.IGNORECASE) def threaded_eval(code_l,global_d,local_d_l): #for code in code_l: yield eval(code,global_d, local_d_l) try: return [ (PathDict(local_d_l=local_d_l,global_d=global_d)[code] if re_var_name.match(code) else eval(code,global_d,PathDict(local_d_l=local_d_l))) for code in code_l ] return [ eval(code,global_d,PathDict(local_d_l)) for code in code_l ] except Exception as error: pp((error, code_l, local_d_l[:-2]), width=width) print('TRACE mode: enter "c" to continue');pdb.set_trace() def get_agg_dict(key_l, value_l_g): if False and len(key_l)==1: # False? <= sometime the same agg is used in 2 of SHO :-( return {key_l[0]:(value_l[0] for value_l in value_l_g)} else: # ideally this next line would use pythreading return dict(zip(key_l,zip(*list(value_l_g)))) def trace_group_by(gen_d, group_func): out_l=[] for enum, d in enumerate(gen_d): k=group_func(d) if not enum or k==prev_k: out_l+=d, else: yield prev_k, out_l out_l=[d] prev_k=k if out_l: yield prev_k, out_l group_by=groupby re_parse_middle_expr=re.compile(r"{{((?:[^{}]|}[^}]|{[^{])*)}}") # {{...}} to look up a agg def parse_agg_expr(expr): tok_l=re_parse_middle_expr.split(expr) inner_l=tok_l[1::2] tok_l[1::2]=["(agg[%r])"%inner for inner in inner_l] inner_d=OrderedDict(zip(inner_l,tok_l[1::2])) outer="".join(tok_l) return outer,inner_d def open_from(from_name): if not isinstance(from_name,(unicode,str)): return from_name else: if from_name.endswith(".py"): return eval("".join(open(from_name,"r")),globals()) elif from_name=="-": return sys.stdin def gen_d_from_csv(col_key_l, csv_str, sep=",", conv_d={}): for line in csv_str.splitlines(): if line: out=OrderedDict(zip(col_key_l,line.split(sep))) for key_l,conv_t in conv_d.items(): for key in key_l.split(): out[key]=conv_t(out[key]) yield out def QUERY(SELECT=None,FROM=None,WHERE=None,GROUP_BY=None,HAVING=None,ORDER_BY=None): if is_solo_arg(SELECT): SELECT=SELECT.split(",") if is_solo_arg(GROUP_BY): GROUP_BY=GROUP_BY.split(",") if not HAVING: HAVING=[] if is_solo_arg(HAVING): HAVING=HAVING.split(",") if not ORDER_BY: ORDER_BY=[] if is_solo_arg(ORDER_BY): ORDER_BY=ORDER_BY.split(",") if not ORDER_BY: ORDER_BY=[] else: shared_eval_d_l=[] SELECT=list(SELECT) FROM=list(open_from(FROM)) if debug: pp((dict( SELECT=SELECT, FROM=FROM, WHERE=WHERE, GROUP_BY=GROUP_BY, HAVING=HAVING, ORDER_BY=ORDER_BY, )),width=width) if WHERE: record_d_where_g=( record_d for record_d in FROM if eval(WHERE,globals(), record_d) ) else: record_d_where_g=FROM SHO=dict(SELECT=SELECT, HAVING=HAVING, ORDER_BY=ORDER_BY,) outer_SHO_code_d=OrderedDict() inner_SHO_agg_code_d=OrderedDict() for keyword,expr_l in SHO.items(): for enum,expr in enumerate(expr_l): outer,inner_d=parse_agg_expr(expr) SHO[keyword][enum]=outer outer_SHO_code_d[outer]=expr inner_SHO_agg_code_d.update(inner_d) """ Wikipedia: The clauses of a query have a particular order of execution[7], which is denoted by the number on the right hand side. It is as follows: * SELECT <columns>: 5 * FROM <table>: 1 * WHERE <predicate on rows>: 2 * GROUP BY <columns>: 3 * HAVING <predicate on groups>: 4 * ORDER BY <columns>: 6 """ if not GROUP_BY: out_g=record_d_where_g else: group_by_func=get_group_by_func(*GROUP_BY) for group_value_l, group_record_d_g in group_by(record_d_where_g, group_by_func): group_d=dict(zip(GROUP_BY,group_value_l)) # Finally!!! aggregate... inner_agg_val_l_g=(threaded_eval(code_l=inner_SHO_agg_code_d.keys(),global_d=globals(),local_d_l=[group_record_d]) for group_record_d in group_record_d_g ) inner_agg_val_l_of_agg_code=get_agg_dict(inner_SHO_agg_code_d.keys(), inner_agg_val_l_g) shared_eval_d=dict(zip(outer_SHO_code_d.keys(), threaded_eval(code_l=outer_SHO_code_d.keys(), global_d=globals(), local_d_l=[group_d, dict(agg=inner_agg_val_l_of_agg_code)])) ) if HAVING and not shared_eval_d[HAVING[0]]: continue if ORDER_BY: shared_eval_d_l+=shared_eval_d, # note the "," else: yield[(select,shared_eval_d[select]) for select in SELECT] if ORDER_BY: shared_eval_d_l.sort(get_order_by_func(*ORDER_BY)) for shared_eval_d in shared_eval_d_l: yield[(select,shared_eval_d[select]) for select in SELECT] #yield[shared_eval_d[select] for select in SELECT] #################################################################### # Unit Test Section #################################################################### col_key_l="customer country order item price quantity".split() order_str=""" Andrew,NZ,1,Apples,5.00,2 Andrew,NZ,1,Bananas,1.00,5 Andrew,NZ,2,Carrots,2.50,2 Brenda,NZ,3,Apples,5.00,1 Brenda,NZ,3,Banana,1.00,1 Brenda,NZ,4,Apples,5.00,1 Brenda,NZ,4,Carrots,2.50,1 Carol,AU,5,Apples,5.00,3 Carol,AU,5,Bananas,1.00,6 Carol,AU,6,Carrots,2.50,4 """ cust_order_product_detail_d_g=gen_d_from_csv(col_key_l, order_str, conv_d={"price quantity":float, "order":int}) OrderedDict=dict def unittest_1a(): # an example of call from another module ans=QUERY( SELECT="country,customer,order,sum{{price*quantity}}", FROM=cust_order_product_detail_d_g, WHERE="price*quantity>0.1", GROUP_BY="country,customer,order", HAVING='sum{{price*quantity}}>0', ORDER_BY='sum{{price*quantity}}', ) pp(list(ans),width=width) if __name__=="__main__": # unittest_1a(); sys.exit() # https://stackoverflow.com/questions/3217673/why-use-argparse-rather-than-optparse import argparse parser = argparse.ArgumentParser(description='Pull target_ls from an html file or URL.', epilog="Good luck.") parser.add_argument('SELECT', nargs='+', help="columns to select") parser.add_argument("--debug","-X", action="store_true", help="drop into debug if there is a problem") parser.add_argument("--FROM","-F", default="-", help="name of table to query") parser.add_argument("--WHERE","-W", default=None, help="predcate on rows") parser.add_argument("--GROUP_BY","-G", default=None, help="columns to group") parser.add_argument("--HAVING","-H", default=None, help="predcate on groups") parser.add_argument("--ORDER_BY","-O", default=None, help="columns to order") arg_d = parser.parse_args()
debug=arg_d.debug
ans=QUERY( SELECT=arg_d.SELECT, # FROM=cust_order_product_detail_d_g, FROM=arg_d.FROM, WHERE=arg_d.WHERE, GROUP_BY=arg_d.GROUP_BY, HAVING=arg_d.HAVING, ORDER_BY=arg_d.ORDER_BY, ) pp(list(ans),width=width)