NevilleDNZ: May 2018

The reason people use SQL is because you can run short queries like this:

Example Query:

$ SELECT country customer order 'sum{{price*quantity}}' --FROM cust_order_d.py --WHERE 'price*quantity>0' --GROUP_BY country,customer,order --HAVING 'sum{{price*quantity}}>0' --ORDER_BY 'sum{{price*quantity}}'

And get the answer without writing too much code...

Example INPUT cust_order_d.py with python or simple JASON

[

  {'country': 'NZ', 'customer': 'Andrew', 'item': 'Apples', 'order': 1, 'price': 5.0, 'quantity': 2.0},
  {'country': 'NZ', 'customer': 'Andrew', 'item': 'Bananas', 'order': 1, 'price': 1.0, 'quantity': 5.0},
  {'country': 'NZ', 'customer': 'Andrew', 'item': 'Carrots', 'order': 2, 'price': 2.5, 'quantity': 2.0},
  {'country': 'NZ', 'customer': 'Brenda', 'item': 'Apples', 'order': 3, 'price': 5.0, 'quantity': 1.0},
  {'country': 'NZ', 'customer': 'Brenda', 'item': 'Banana', 'order': 3, 'price': 1.0, 'quantity': 1.0},
  {'country': 'NZ', 'customer': 'Brenda', 'item': 'Apples', 'order': 4, 'price': 5.0, 'quantity': 1.0},
  {'country': 'NZ', 'customer': 'Brenda', 'item': 'Carrots', 'order': 4, 'price': 2.5, 'quantity': 1.0},
  {'country': 'AU', 'customer': 'Carol', 'item': 'Apples', 'order': 5, 'price': 5.0, 'quantity': 3.0},
  {'country': 'AU', 'customer': 'Carol', 'item': 'Bananas', 'order': 5, 'price': 1.0, 'quantity': 6.0},
  {'country': 'AU', 'customer': 'Carol', 'item': 'Carrots', 'order': 6, 'price': 2.5, 'quantity': 4.0}
]

Example Output:

[[('country', 'NZ'), ('customer', 'Andrew'), ('order', 2), ("sum(agg['price*quantity'])", 5.0)],
 [('country', 'NZ'), ('customer', 'Brenda'), ('order', 3), ("sum(agg['price*quantity'])", 6.0)],
 [('country', 'NZ'), ('customer', 'Brenda'), ('order', 4), ("sum(agg['price*quantity'])", 7.5)],
 [('country', 'AU'), ('customer', 'Carol'), ('order', 6), ("sum(agg['price*quantity'])", 10.0)],
 [('country', 'NZ'), ('customer', 'Andrew'), ('order', 1), ("sum(agg['price*quantity'])", 15.0)],
 [('country', 'AU'), ('customer', 'Carol'), ('order', 5), ("sum(agg['price*quantity'])", 21.0)]]

Note that the order sub-total as correctly calculated and sorted on the right hand side...

And that didn't even hurt... :-)

This is only Alpha code, but I am happy to release the code under the following license...

Attribution-NonCommercial-NoDerivs 3.0 Australia(CC BY-NC-ND 3.0 AU)

Here is the Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
end=fi=od=done=esac=yrt=htiw=fed=ssalc=lambda *skip: skip # ¢ replace Bourne's "done" with "od" & add 2¢
import re,time,os,sys,time,string,pprint,pdb
pp=pprint.pprint; width=132
debug=False

from itertools import *
from collections import *

def get_group_by_func(*field_l):
  def out(record_d): 
    return [ record_d[field] if isinstance(field,str) else field(record_d) for field in field_l ]
  return out 

def get_order_by_func(*field_l):
  key_func=get_group_by_func(*field_l)
  return lambda a,b: cmp(key_func(a), key_func(b))

def is_solo_arg(arg):
  return isinstance(arg,(str,unicode))

class PathDict(object):
  def __init__(self,local_d_l=[],global_d={}): 
    self.local_d_l=local_d_l # .copy()???
    if global_d: self.local_d_l+=global_d,

  def __getitem__(self,key):
    for d in self.local_d_l:
      try: return d[key]
      except KeyError as error: pass
    else: raise
  #get=__getitem__

re_var_name=re.compile("^[a-z_][a-z0-9_]*$",re.IGNORECASE)

def threaded_eval(code_l,global_d,local_d_l):
  #for code in code_l: yield eval(code,global_d, local_d_l)
  try: 
    return [ 
      (PathDict(local_d_l=local_d_l,global_d=global_d)[code] 
        if re_var_name.match(code) 
        else eval(code,global_d,PathDict(local_d_l=local_d_l))) 
          for code in code_l ]
    return [ eval(code,global_d,PathDict(local_d_l)) for code in code_l ]
  except Exception as error: 
    pp((error, code_l, local_d_l[:-2]), width=width)
    print('TRACE mode: enter "c" to continue');pdb.set_trace()

def get_agg_dict(key_l, value_l_g):
  if False and len(key_l)==1: # False? <= sometime the same agg is used in 2 of SHO :-(
    return {key_l[0]:(value_l[0] for value_l in value_l_g)}
  else: # ideally this next line would use pythreading
    return dict(zip(key_l,zip(*list(value_l_g))))

def trace_group_by(gen_d, group_func):
  out_l=[]
  for enum, d in enumerate(gen_d): 
    k=group_func(d)
    if not enum or k==prev_k:
      out_l+=d,
    else:
      yield prev_k, out_l
      out_l=[d]
    prev_k=k
  if out_l: yield prev_k, out_l
      
group_by=groupby

re_parse_middle_expr=re.compile(r"{{((?:[^{}]|}[^}]|{[^{])*)}}")  # {{...}}  to look up a agg

def parse_agg_expr(expr):
  tok_l=re_parse_middle_expr.split(expr)
  inner_l=tok_l[1::2]
  tok_l[1::2]=["(agg[%r])"%inner for inner in inner_l]
  inner_d=OrderedDict(zip(inner_l,tok_l[1::2]))
  outer="".join(tok_l)
  return outer,inner_d

def open_from(from_name):
  if not isinstance(from_name,(unicode,str)):
    return from_name
  else:
    if from_name.endswith(".py"):
      return eval("".join(open(from_name,"r")),globals())
    elif from_name=="-":
      return sys.stdin

def gen_d_from_csv(col_key_l, csv_str, sep=",", conv_d={}):
  for line in csv_str.splitlines():
    if line:
      out=OrderedDict(zip(col_key_l,line.split(sep)))
      for key_l,conv_t in conv_d.items():
        for key in key_l.split():
          out[key]=conv_t(out[key])
      yield out

def QUERY(SELECT=None,FROM=None,WHERE=None,GROUP_BY=None,HAVING=None,ORDER_BY=None):
  if is_solo_arg(SELECT): SELECT=SELECT.split(",")
  if is_solo_arg(GROUP_BY): GROUP_BY=GROUP_BY.split(",")

  if not HAVING: HAVING=[]
  if is_solo_arg(HAVING): HAVING=HAVING.split(",")

  if not ORDER_BY: ORDER_BY=[]
  if is_solo_arg(ORDER_BY): ORDER_BY=ORDER_BY.split(",")

  if not ORDER_BY: 
    ORDER_BY=[]
  else:
    shared_eval_d_l=[]

  SELECT=list(SELECT)
  FROM=list(open_from(FROM))
  if debug: pp((dict(
    SELECT=SELECT,
    FROM=FROM,
    WHERE=WHERE,
    GROUP_BY=GROUP_BY,
    HAVING=HAVING,
    ORDER_BY=ORDER_BY,
  )),width=width)

  if WHERE: record_d_where_g=( record_d for record_d in FROM if eval(WHERE,globals(), record_d) )
  else: record_d_where_g=FROM

  SHO=dict(SELECT=SELECT, HAVING=HAVING, ORDER_BY=ORDER_BY,)

  outer_SHO_code_d=OrderedDict()
  inner_SHO_agg_code_d=OrderedDict()

  for keyword,expr_l in SHO.items():
    for enum,expr in enumerate(expr_l):
      outer,inner_d=parse_agg_expr(expr)
      SHO[keyword][enum]=outer
      outer_SHO_code_d[outer]=expr
      inner_SHO_agg_code_d.update(inner_d)
  """
Wikipedia: The clauses of a query have a particular order of
execution[7], which is denoted by the number on the right hand
side. It is as follows:
* SELECT <columns>: 5
* FROM <table>: 1
* WHERE <predicate on rows>: 2
* GROUP BY <columns>: 3
* HAVING <predicate on groups>: 4
* ORDER BY <columns>: 6
  """
  if not GROUP_BY:
    out_g=record_d_where_g
  else:
    group_by_func=get_group_by_func(*GROUP_BY)
    for group_value_l, group_record_d_g in group_by(record_d_where_g, group_by_func):
      group_d=dict(zip(GROUP_BY,group_value_l))
# Finally!!! aggregate...
      inner_agg_val_l_g=(threaded_eval(code_l=inner_SHO_agg_code_d.keys(),global_d=globals(),local_d_l=[group_record_d]) 
        for group_record_d in group_record_d_g )
      inner_agg_val_l_of_agg_code=get_agg_dict(inner_SHO_agg_code_d.keys(), inner_agg_val_l_g)
      shared_eval_d=dict(zip(outer_SHO_code_d.keys(),
        threaded_eval(code_l=outer_SHO_code_d.keys(),
        global_d=globals(), local_d_l=[group_d, dict(agg=inner_agg_val_l_of_agg_code)]))
      )
      if HAVING and not shared_eval_d[HAVING[0]]: continue
      if ORDER_BY: shared_eval_d_l+=shared_eval_d, # note the ","
      else: yield[(select,shared_eval_d[select]) for select in SELECT]

  if ORDER_BY: 
    shared_eval_d_l.sort(get_order_by_func(*ORDER_BY))
    for shared_eval_d in shared_eval_d_l: 
      yield[(select,shared_eval_d[select]) for select in SELECT]
      #yield[shared_eval_d[select] for select in SELECT]

####################################################################
# Unit Test Section
####################################################################
col_key_l="customer country order item price quantity".split()

order_str="""
Andrew,NZ,1,Apples,5.00,2
Andrew,NZ,1,Bananas,1.00,5
Andrew,NZ,2,Carrots,2.50,2
Brenda,NZ,3,Apples,5.00,1
Brenda,NZ,3,Banana,1.00,1
Brenda,NZ,4,Apples,5.00,1
Brenda,NZ,4,Carrots,2.50,1
Carol,AU,5,Apples,5.00,3
Carol,AU,5,Bananas,1.00,6
Carol,AU,6,Carrots,2.50,4
"""
cust_order_product_detail_d_g=gen_d_from_csv(col_key_l, order_str, conv_d={"price quantity":float, "order":int})

OrderedDict=dict

def unittest_1a(): # an example of call from another module
  ans=QUERY(
    SELECT="country,customer,order,sum{{price*quantity}}",
    FROM=cust_order_product_detail_d_g,
    WHERE="price*quantity>0.1",
    GROUP_BY="country,customer,order",
    HAVING='sum{{price*quantity}}>0',
    ORDER_BY='sum{{price*quantity}}',
  )
  pp(list(ans),width=width)

if __name__=="__main__":
  # unittest_1a(); sys.exit()
# https://stackoverflow.com/questions/3217673/why-use-argparse-rather-than-optparse
  import argparse
  parser = argparse.ArgumentParser(description='Pull target_ls from an html file or URL.', epilog="Good luck.")
  parser.add_argument('SELECT', nargs='+', help="columns to select")
  parser.add_argument("--debug","-X", action="store_true", help="drop into debug if there is a problem")
  parser.add_argument("--FROM","-F", default="-", help="name of table to query")
  parser.add_argument("--WHERE","-W", default=None, help="predcate on rows")
  parser.add_argument("--GROUP_BY","-G", default=None, help="columns to group")
  parser.add_argument("--HAVING","-H", default=None, help="predcate on groups")
  parser.add_argument("--ORDER_BY","-O", default=None, help="columns to order")

  arg_d = parser.parse_args()

  debug=arg_d.debug

  ans=QUERY(
    SELECT=arg_d.SELECT,
    # FROM=cust_order_product_detail_d_g,
    FROM=arg_d.FROM,
    WHERE=arg_d.WHERE,
    GROUP_BY=arg_d.GROUP_BY,
    HAVING=arg_d.HAVING,
    ORDER_BY=arg_d.ORDER_BY,
  )
  pp(list(ans),width=width)

NevilleDNZ

Sunday, May 6, 2018

How to: Run a super simple SQL query from inside Python, or even from the command line...

Example Query:

Example INPUT cust_order_d.py with python or simple JASON

Example Output:

Attribution-NonCommercial-NoDerivs 3.0 Australia(CC BY-NC-ND 3.0 AU)

Here is the Code:

About Me