diff --git a/core/query_parser.py b/core/query_parser.py new file mode 100644 index 0000000..1ac3796 --- /dev/null +++ b/core/query_parser.py @@ -0,0 +1,23 @@ +from core.ast.node import QueryNode + +class QueryParser: + + def parse(self, query: str) -> QueryNode: + # Implement parsing logic using self.rules + pass + + # [1] Call mo_sql_parser + # str -> Any (JSON) + + # [2] Our new code + # Any (JSON) -> AST (QueryNode) + + def format(self, query: QueryNode) -> str: + # Implement formatting logic to convert AST back to SQL string + pass + + # [1] Our new code + # AST (QueryNode) -> JSON + + # [2] Call mo_sql_format + # Any (JSON) -> str \ No newline at end of file diff --git a/data/queries.py b/data/queries.py new file mode 100644 index 0000000..dd1c8ed --- /dev/null +++ b/data/queries.py @@ -0,0 +1,980 @@ +queries = [ + { + 'id': 1, + 'name': 'Remove Cast Date Match Twice', + 'pattern': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE CAST(DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + AS DATE) IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''', + 'rewrite': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', created_at) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''' + }, + + { + 'id': 2, + 'name': 'Remove Cast Date Match Once', + 'pattern': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''', + 'rewrite': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', created_at) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''' + }, + + { + 'id': 3, + 'name': 'Remove Cast Date No Match', + 'pattern': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', created_at) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''', + 'rewrite': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', created_at) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(text, 'iphone') > 0) + GROUP BY 2; + ''' + }, + + { + 'id': 4, + 'name': 'Replace Strpos Lower Match', + 'pattern': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE CAST(DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + AS DATE) IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND (STRPOS(LOWER(text), 'iphone') > 0) + GROUP BY 2; + ''', + 'rewrite': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE CAST(DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + AS DATE) IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND text ILIKE '%iphone%' + GROUP BY 2; + ''' + }, + + { + 'id': 5, + 'name': 'Replace Strpos Lower No Match', + 'pattern': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND text ILIKE '%iphone%' + GROUP BY 2; + ''', + 'rewrite': ''' + SELECT SUM(1), + CAST(state_name AS TEXT) + FROM tweets + WHERE DATE_TRUNC('QUARTER', + CAST(created_at AS DATE)) + IN + ((TIMESTAMP '2016-10-01 00:00:00.000'), + (TIMESTAMP '2017-01-01 00:00:00.000'), + (TIMESTAMP '2017-04-01 00:00:00.000')) + AND text ILIKE '%iphone%' + GROUP BY 2; + ''' + }, + + { + 'id': 6, + 'name': 'Remove Self Join Match', + 'pattern': ''' + SELECT e1.name, + e1.age, + e2.salary + FROM employee e1, employee e2 + WHERE e1.id = e2.id + AND e1.age > 17 + AND e2.salary > 35000; + ''', + 'rewrite': ''' + SELECT e1.name, + e1.age, + e1.salary + FROM employee e1 + WHERE 1=1 + AND e1.age > 17 + AND e1.salary > 35000; + ''' + }, + + { + 'id': 7, + 'name': 'Remove Self Join No Match', + 'pattern': ''' + SELECT e1.name, + e1.age, + e1.salary + FROM employee e1 + WHERE e1.age > 17 + AND e1.salary > 35000; + ''', + 'rewrite': ''' + SELECT e1.name, + e1.age, + e1.salary + FROM employee e1 + WHERE e1.age > 17 + AND e1.salary > 35000; + ''' + }, + + { + 'id': 8, + 'name': 'Remove Self Join Match Simple', + 'pattern': ''' + SELECT e1.age + FROM employee e1, employee e2 + WHERE e1.id = e2.id + AND e1.age > 17; + ''', + 'rewrite': ''' + SELECT e1.age + FROM employee e1 + WHERE 1=1 + AND e1.age > 17; + ''' + }, + + { + 'id': 9, + 'name': 'Subquery to Join Match 1', + 'pattern': ''' + select empno, firstnme, lastname, phoneno + from employee + where workdept in + (select deptno + from department + where deptname = 'OPERATIONS') + and 1=1; + ''', + 'rewrite': ''' + select distinct empno, firstnme, lastname, phoneno + from employee, department + where employee.workdept = department.deptno + and deptname = 'OPERATIONS' + and 1=1; + ''' + }, + + { + 'id': 10, + 'name': 'Subquery to Join Match 2', + 'pattern': ''' + select empno, firstnme, lastname, phoneno + from employee + where workdept in + (select deptno + from department + where deptname = 'OPERATIONS') + and age > 17; + ''', + 'rewrite': ''' + select distinct empno, firstnme, lastname, phoneno + from employee, department + where employee.workdept = department.deptno + and deptname = 'OPERATIONS' + and age > 17; + ''' + }, + + { + 'id': 11, + 'name': 'Subquery to Join Match 3', + 'pattern': ''' + select e.empno, e.firstnme, e.lastname, e.phoneno + from employee e + where e.workdept in + (select d.deptno + from department d + where d.deptname = 'OPERATIONS') + and e.age > 17; + ''', + 'rewrite': ''' + select distinct e.empno, e.firstnme, e.lastname, e.phoneno + from employee e, department d + where e.workdept = d.deptno + and d.deptname = 'OPERATIONS' + and e.age > 17; + ''' + }, + + { + 'id': 12, + 'name': 'Join to Filter Match 1', + 'pattern': ''' + SELECT * + FROM blc_admin_permission adminpermi0_ + INNER JOIN blc_admin_role_permission_xref allroles1_ + ON adminpermi0_.admin_permission_id = + allroles1_.admin_permission_id + INNER JOIN blc_admin_role adminrolei2_ + ON allroles1_.admin_role_id = adminrolei2_.admin_role_id + WHERE adminrolei2_.admin_role_id = 1 + AND 1=1; + ''', + 'rewrite': ''' + SELECT * + FROM blc_admin_permission AS adminpermi0_ + INNER JOIN blc_admin_role_permission_xref AS allroles1_ + ON adminpermi0_.admin_permission_id = + allroles1_.admin_permission_id + WHERE allroles1_.admin_role_id = 1 + AND 1=1; + ''' + }, + + { + 'id': 13, + 'name': 'Join to Filter Match 2', + 'pattern': ''' + SELECT Count(adminpermi0_.admin_permission_id) AS col_0_0_ + FROM blc_admin_permission adminpermi0_ + INNER JOIN blc_admin_role_permission_xref allroles1_ + ON adminpermi0_.admin_permission_id = + allroles1_.admin_permission_id + INNER JOIN blc_admin_role adminrolei2_ + ON allroles1_.admin_role_id = adminrolei2_.admin_role_id + WHERE adminpermi0_.is_friendy = 1 + AND adminrolei2_.admin_role_id = 1; + ''', + 'rewrite': ''' + SELECT Count(adminpermi0_.admin_permission_id) AS col_0_0_ + FROM blc_admin_permission AS adminpermi0_ + INNER JOIN blc_admin_role_permission_xref AS allroles1_ + ON adminpermi0_.admin_permission_id = + allroles1_.admin_permission_id + WHERE allroles1_.admin_role_id = 1 + AND adminpermi0_.is_friendy = 1; + ''' + }, + + { + 'id': 14, + 'name': 'Test Rule Wetune 90 Match', + 'pattern': ''' + SELECT adminpermi0_.admin_permission_id AS admin_pe1_4_, + adminpermi0_.description AS descript2_4_, + adminpermi0_.is_friendly AS is_frien3_4_, + adminpermi0_.name AS name4_4_, + adminpermi0_.permission_type AS permissi5_4_ + FROM blc_admin_permission adminpermi0_ + INNER JOIN blc_admin_role_permission_xref allroles1_ ON adminpermi0_.admin_permission_id = allroles1_.admin_permission_id + INNER JOIN blc_admin_role adminrolei2_ ON allroles1_.admin_role_id = adminrolei2_.admin_role_id + WHERE adminpermi0_.is_friendly = 1 + AND adminrolei2_.admin_role_id = 1 + ORDER BY adminpermi0_.description ASC + LIMIT 50 + ''', + 'rewrite': ''' + SELECT adminpermi0_.admin_permission_id AS admin_pe1_4_, + adminpermi0_.description AS descript2_4_, + adminpermi0_.is_friendly AS is_frien3_4_, + adminpermi0_.name AS name4_4_, + adminpermi0_.permission_type AS permissi5_4_ + FROM blc_admin_permission adminpermi0_ + INNER JOIN blc_admin_role_permission_xref allroles1_ ON adminpermi0_.admin_permission_id = allroles1_.admin_permission_id + WHERE adminpermi0_.is_friendly = 1 + AND allroles1_.admin_role_id = 1 + ORDER BY adminpermi0_.description ASC + LIMIT 50 + ''' + }, + + { + 'id': 15, + 'name': 'Test Rule Calcite PushMinThroughUnion', + 'pattern': ''' + SELECT t.ENAME, + MIN(t.EMPNO) + FROM + (SELECT * + FROM EMP AS EMP + UNION ALL SELECT * + FROM EMP AS EMP) AS t + GROUP BY t.ENAME + ''', + 'rewrite': ''' + SELECT t6.ENAME, MIN(MIN(EMP.EMPNO)) + FROM (SELECT EMP.ENAME, MIN(EMP.EMPNO) + FROM EMP + GROUP BY EMP.ENAME + UNION ALL SELECT EMP.ENAME, MIN(EMP.EMPNO) + FROM EMP + GROUP BY EMP.ENAME) AS t6 + GROUP BY t6.ENAME + ''' + }, + + { + 'id': 16, + 'name': 'Remove Max Distinct', + 'pattern': ''' + SELECT A, MAX(DISTINCT (SELECT B FROM R WHERE C = 0)), D + FROM S; + ''', + 'rewrite': ''' + SELECT A, MAX((SELECT B FROM R WHERE C = 0)), D + FROM S; + ''' + }, + + { + 'id': 17, + 'name': 'Remove 1 Useless InnerJoin', + 'pattern': ''' + SELECT o_auth_applications.id + FROM o_auth_applications + INNER JOIN authorizations + ON o_auth_applications.id = authorizations.o_auth_application_id + WHERE authorizations.user_id = 1465 + ''', + 'rewrite': ''' + SELECT authorizations.o_auth_application_id + FROM authorizations + WHERE authorizations.user_id = 1465 + ''' + }, + + { + 'id': 18, + 'name': 'Stackoverflow 1', + 'pattern': ''' + SELECT DISTINCT my_table.foo, your_table.boo + FROM my_table, your_table + WHERE my_table.num = 1 OR your_table.num = 2 + ''', + 'rewrite': ''' + SELECT + my_table.foo, + your_table.boo + FROM + my_table, + your_table + WHERE + my_table.num = 1 + OR your_table.num = 2 + GROUP BY + my_table.foo, + your_table.boo + ''' + }, + + { + 'id': 19, + 'name': 'Partial Matching Base Case 1', + 'pattern': ''' + SELECT * + FROM A a + LEFT JOIN B b ON a.id = b.cid + WHERE + b.cl1 = 's1' OR b.cl1 ='s2' + ''', + 'rewrite': ''' + SELECT * + FROM A a + LEFT JOIN B b ON a.id = b.cid + WHERE + b.cl1 IN ('s1', 's2') + ''' + }, + + { + 'id': 20, + 'name': 'Partial Matching Base Case 2', + 'pattern': ''' + SELECT * + FROM b + WHERE + b.cl1 IN ('s1', 's2') OR b.cl1 ='s3' + ''', + 'rewrite': ''' + SELECT * + FROM b + WHERE + b.cl1 IN ('s3', 's1', 's2') + ''' + }, + + { + 'id': 21, + 'name': 'Partial Matching 0', + 'pattern': ''' + SELECT * + FROM A a + LEFT JOIN B b ON a.id = b.cid + WHERE + b.cl1 = 's1' OR b.cl1 = 's2' OR b.cl1 = 's3' + ''', + 'rewrite': ''' + SELECT * + FROM A a + LEFT JOIN B b ON a.id = b.cid + WHERE + b.cl1 IN ('s1', 's2') OR b.cl1 = 's3' + ''' + }, + + { + 'id': 22, + 'name': 'Partial Matching 4', + 'pattern': ''' + select empno, firstname, lastname, phoneno + from employee + where workdept in + (select deptno + from department + where deptname = 'OPERATIONS') + and firstname like 'B%' + ''', + 'rewrite': ''' + select distinct empno, firstname, lastname, phoneno + from employee, department + where employee.workdept = department.deptno + and deptname = 'OPERATIONS' + and firstname like 'B%' + ''' + }, + + { + 'id': 23, + 'name': 'Partial Keeps Remaining OR', + 'pattern': ''' + SELECT entities.data + FROM entities + WHERE entities._id IN (SELECT index_users_email._id + FROM index_users_email + WHERE index_users_email.key = 'test') + OR entities._id IN (SELECT index_users_profile_name._id + FROM index_users_profile_name + WHERE index_users_profile_name.key = 'test') + ''', + 'rewrite': ''' + SELECT entities.data + FROM entities + INNER JOIN index_users_email ON index_users_email._id = entities._id + WHERE index_users_email.key = 'test' + OR entities._id IN (SELECT index_users_profile_name._id + FROM index_users_profile_name + WHERE index_users_profile_name.key = 'test') + ''' + }, + + { + 'id': 24, + 'name': 'Partial Keeps Remaining AND', + 'pattern': ''' + SELECT Empno + FROM EMP + WHERE EMPNO > 10 + AND EMPNO <= 10 + AND EMPNAME LIKE '%Jason%' + ''', + 'rewrite': ''' + SELECT Empno + FROM EMP + WHERE FALSE + AND EMPNAME LIKE '%Jason%' + ''' + }, + + { + 'id': 25, + 'name': 'And On True', + 'pattern': ''' + SELECT people.name + FROM people + WHERE 1 AND 1 + ''', + 'rewrite': ''' + SELECT people.name + FROM people + ''' + }, + + { + 'id': 26, + 'name': 'Multiple And On True', + 'pattern': ''' + SELECT name + FROM people + WHERE 1 = 1 AND 2 = 2 + ''', + 'rewrite': ''' + SELECT name + FROM people + ''' + }, + + { + 'id': 27, + 'name': 'Remove Where True', + 'pattern': ''' + SELECT * + FROM Emp + WHERE age > age - 2; + ''', + 'rewrite': ''' + SELECT * + FROM Emp + ''' + }, + + { + 'id': 28, + 'name': 'Rewrite Skips Failed Partial', + 'pattern': ''' + SELECT * + FROM accounts + WHERE LOWER(accounts.firstname) = LOWER('Sam') + AND accounts.id IN (SELECT addresses.account_id + FROM addresses + WHERE LOWER(addresses.name) = LOWER('Street1')) + AND accounts.id IN (SELECT alternate_ids.account_id + FROM alternate_ids + WHERE alternate_ids.alternate_id_glbl = '5'); + ''', + 'rewrite': ''' + SELECT * + FROM accounts + JOIN addresses ON accounts.id = addresses.account_id + JOIN alternate_ids ON accounts.id = alternate_ids.account_id + WHERE LOWER(accounts.firstname) = LOWER('Sam') + AND LOWER(addresses.name) = LOWER('Street1') + AND alternate_ids.alternate_id_glbl = '5'; + ''' + }, + + { + 'id': 29, + 'name': 'Full Matching', + 'pattern': ''' + SELECT entities.data FROM entities WHERE entities._id IN (SELECT index_users_email._id FROM index_users_email WHERE index_users_email.key = 'test') + UNION + SELECT entities.data FROM entities WHERE entities._id IN (SELECT index_users_profile_name._id FROM index_users_profile_name WHERE index_users_profile_name.key = 'test') + ''', + 'rewrite': ''' + SELECT entities.data FROM entities INNER JOIN index_users_email ON index_users_email._id = entities._id WHERE index_users_email.key = 'test' + UNION + SELECT entities.data FROM entities INNER JOIN index_users_profile_name ON index_users_profile_name._id = entities._id WHERE index_users_profile_name.key = 'test' + ''' + }, + + { + 'id': 30, + 'name': 'Over Partial Matching', + 'pattern': ''' + SELECT * FROM table_name WHERE (table_name.title = 1 and table_name.grade = 2) OR (table_name.title = 2 and table_name.debt = 2 and table_name.grade = 3) OR (table_name.prog = 1 and table_name.title =1 and table_name.debt = 3) + ''', + 'rewrite': ''' + SELECT * FROM table_name WHERE (table_name.title = 1 and table_name.grade = 2) OR (table_name.title = 2 and table_name.debt = 2 and table_name.grade = 3) OR (table_name.prog = 1 and table_name.title =1 and table_name.debt = 3) + ''' + }, + + { + 'id': 31, + 'name': 'Aggregation to Subquery', + 'pattern': ''' +SELECT + t1.CPF, + DATE(t1.data) AS data, + CASE WHEN SUM(CASE WHEN t1.login_ok = true + THEN 1 + ELSE 0 + END) >= 1 + THEN true + ELSE false + END +FROM db_risco.site_rn_login AS t1 +GROUP BY t1.CPF, DATE(t1.data) + ''', + 'rewrite': ''' +SELECT + t1.CPF, + t1.data +FROM ( + SELECT + CPF, + DATE(data) + FROM db_risco.site_rn_login + WHERE login_ok = true +) t1 +GROUP BY t1.CPF, t1.data + ''' + }, + + { + 'id': 32, + 'name': 'Spreadsheet ID 2', + 'pattern': ''' +SELECT * +FROM place +WHERE "select" = TRUE + OR exists (SELECT id + FROM bookmark + WHERE user IN (1,2,3,4) + AND bookmark.place = place.id) + LIMIT 10; + ''', + 'rewrite': ''' +SELECT * +FROM ( + (SELECT * + FROM place + WHERE "select" = True + LIMIT 10) +UNION + (SELECT * + FROM place + WHERE EXISTS + (SELECT 1 + FROM bookmark + WHERE user IN (1, 2, 3, 4) + AND bookmark.place = place.id) + LIMIT 10)) +LIMIT 10 + ''' + }, + + { + 'id': 33, + 'name': 'Spreadsheet ID 3', + 'pattern': ''' +SELECT EMPNO FROM EMP WHERE EMPNO > 10 AND EMPNO <= 10 + ''', + 'rewrite': ''' +SELECT EMPNO FROM EMP WHERE FALSE + ''' + }, + + { + 'id': 34, + 'name': 'Spreadsheet ID 7', + 'pattern': ''' +select * from +a +left join b on a.id = b.cid +where +b.cl1 = 's1' +or +b.cl1 ='s2' +or +b.cl1 ='s3' + ''', + 'rewrite': ''' +select * from +a +left join b on a.id = b.cid +where +b.cl1 in ('s1','s2','s3') + ''' + }, + + { + 'id': 35, + 'name': 'Spreadsheet ID 9', + 'pattern': ''' +SELECT DISTINCT my_table.foo +FROM my_table +WHERE my_table.num = 1; + ''', + 'rewrite': ''' +SELECT my_table.foo +FROM my_table +WHERE my_table.num = 1 +GROUP BY my_table.foo; + ''' + }, + + { + 'id': 36, + 'name': 'Spreadsheet ID 10', + 'pattern': ''' +SELECT table1.wpis_id +FROM table1 +WHERE table1.etykieta_id IN ( + SELECT table2.tag_id + FROM table2 + WHERE table2.postac_id = 376476 + ); + ''', + 'rewrite': ''' +SELECT table1.wpis_id +FROM table1 +INNER JOIN table2 on table2.tag_id = table1.etykieta_id +WHERE table2.postac_id = 376476 + ''' + }, + + { + 'id': 37, + 'name': 'Spreadsheet ID 11', + 'pattern': ''' +SELECT historicoestatusrequisicion_id, requisicion_id, estatusrequisicion_id, + comentario, fecha_estatus, usuario_id + FROM historicoestatusrequisicion hist1 + WHERE requisicion_id IN + ( + SELECT requisicion_id FROM historicoestatusrequisicion hist2 + WHERE usuario_id = 27 AND estatusrequisicion_id = 1 + ) + ORDER BY requisicion_id, estatusrequisicion_id + ''', + 'rewrite': ''' +SELECT hist1.historicoestatusrequisicion_id, hist1.requisicion_id, hist1.estatusrequisicion_id, hist1.comentario, hist1.fecha_estatus, hist1.usuario_id + FROM historicoestatusrequisicion hist1 + JOIN historicoestatusrequisicion hist2 ON hist2.requisicion_id = hist1.requisicion_id + WHERE hist2.usuario_id = 27 AND hist2.estatusrequisicion_id = 1 + ORDER BY hist1.requisicion_id, hist1.estatusrequisicion_id + ''' + }, + + { + 'id': 38, + 'name': 'Spreadsheet ID 12', + 'pattern': ''' +SELECT po.id, + SUM(grouped_items.total_quantity) AS order_total_quantity +FROM purchase_orders po +LEFT JOIN ( + SELECT items.purchase_order_id, + SUM(items.quantity) AS item_total + FROM items + GROUP BY items.purchase_order_id +) grouped_items ON po.id = grouped_items.purchase_order_id +WHERE po.shop_id = 195 +GROUP BY po.id + ''', + 'rewrite': ''' +SELECT po.id, + ( + SELECT SUM(items.quantity) + FROM items + WHERE items.purchase_order_id = po.id + GROUP BY items.purchase_order_id + ) AS order_total_quantity +FROM purchase_orders po +WHERE shop_id = 195 +GROUP BY po.id + ''' + }, + + { + 'id': 39, + 'name': 'Spreadsheet ID 15', + 'pattern': ''' +SELECT * +FROM users u +WHERE u.id IN + (SELECT s1.user_id + FROM sessions s1 + WHERE s1.user_id <> 1234 + AND (s1.ip IN + (SELECT s2.ip + FROM sessions s2 + WHERE s2.user_id = 1234 + GROUP BY s2.ip) + OR s1.cookie_identifier IN + (SELECT s3.cookie_identifier + FROM sessions s3 + WHERE s3.user_id = 1234 + GROUP BY s3.cookie_identifier)) + GROUP BY s1.user_id) + ''', + 'rewrite': ''' +SELECT * +FROM users u +WHERE EXISTS ( + SELECT + NULL + FROM sessions s1 + WHERE s1.user_id <> 1234 + AND u.id = s1.user_id + AND EXISTS ( + SELECT + NULL + FROM sessions s2 + WHERE s2.user_id = 1234 + AND (s1.ip = s2.ip + OR s1.cookie_identifier = s2.cookie_identifier + ) + ) + ) + ''' + }, + + { + 'id': 40, + 'name': 'Spreadsheet ID 18', + 'pattern': ''' +SELECT DISTINCT ON (t.playerId) t.gzpId, t.pubCode, t.playerId, + COALESCE (p.preferenceValue,'en'), + s.segmentId +FROM userPlayerIdMap t LEFT JOIN + userPreferences p + ON t.gzpId = p.gzpId LEFT JOIN + segment s + ON t.gzpId = s.gzpId +WHERE t.pubCode IN ('hyrmas','ayqioa','rj49as99') and + t.provider IN ('FCM','ONE_SIGNAL') and + s.segmentId IN (0,1,2,3,4,5,6) and + p.preferenceValue IN ('en','hi') +ORDER BY t.playerId desc; + ''', + 'rewrite': ''' +SELECT t.gzpId, t.pubCode, t.playerId, + COALESCE((SELECT p.preferenceValue + FROM userPreferences p + WHERE t.gzpId = p.gzpId AND + p.preferenceValue IN ('en', 'hi') + LIMIT 1 + ), 'en' + ), + (SELECT s.segmentId + FROM segment s + WHERE t.gzpId = s.gzpId AND + s.segmentId IN (0, 1, 2, 3, 4, 5, 6) + LIMIT 1 + ) +FROM userPlayerIdMap t +WHERE t.pubCode IN ('hyrmas', 'ayqioa', 'rj49as99') and + t.provider IN ('FCM', 'ONE_SIGNAL'); + ''' + }, + + { + 'id': 41, + 'name': 'Spreadsheet ID 20', + 'pattern': ''' +SELECT * FROM (SELECT * FROM (SELECT NULL FROM EMP) WHERE N IS NULL) WHERE N IS NULL + ''', + 'rewrite': ''' +SELECT NULL FROM EMP + ''' + }, + + { + 'id': 42, + 'name': 'PostgreSQL Test', + 'pattern': ''' + SELECT "tweets"."latitude" AS "latitude", + "tweets"."longitude" AS "longitude" + FROM "public"."tweets" "tweets" + WHERE (("tweets"."latitude" >= -90) AND ("tweets"."latitude" <= 80) + AND ((("tweets"."longitude" >= -173.80000000000001) AND ("tweets"."longitude" <= 180)) OR ("tweets"."longitude" IS NULL)) + AND (CAST((DATE_TRUNC( 'day', CAST("tweets"."created_at" AS DATE) ) + (-EXTRACT(DOW FROM "tweets"."created_at") * INTERVAL '1 DAY')) AS DATE) + = (TIMESTAMP '2018-04-22 00:00:00.000')) + AND (STRPOS(CAST(LOWER(CAST(CAST("tweets"."text" AS TEXT) AS TEXT)) AS TEXT),CAST('microsoft' AS TEXT)) > 0)) + GROUP BY 1, 2 + ''', + 'rewrite': ''' + SELECT "tweets"."latitude" AS "latitude", + "tweets"."longitude" AS "longitude" + FROM "public"."tweets" "tweets" + WHERE (("tweets"."latitude" >= -90) AND ("tweets"."latitude" <= 80) + AND ((("tweets"."longitude" >= -173.80000000000001) AND ("tweets"."longitude" <= 180)) OR ("tweets"."longitude" IS NULL)) + AND ((DATE_TRUNC( 'day', "tweets"."created_at" ) + (-EXTRACT(DOW FROM "tweets"."created_at") * INTERVAL '1 DAY')) + = (TIMESTAMP '2018-04-22 00:00:00.000')) + AND "tweets"."text" ILIKE '%microsoft%') + GROUP BY 1, 2 + ''' + }, + + { + 'id': 43, + 'name': 'MySQL Test', + 'pattern': ''' +SELECT `tweets`.`latitude` AS `latitude`, + `tweets`.`longitude` AS `longitude` + FROM `tweets` + WHERE ((ADDDATE(DATE_FORMAT(`tweets`.`created_at`, '%Y-%m-01 00:00:00'), INTERVAL 0 SECOND) = TIMESTAMP('2017-03-01 00:00:00')) + AND (LOCATE('iphone', LOWER(`tweets`.`text`)) > 0)) + GROUP BY 1, 2''', + 'rewrite': ''' +SELECT `tweets`.`latitude` AS `latitude`, + `tweets`.`longitude` AS `longitude` + FROM `tweets` + WHERE ((DATE_FORMAT(`tweets`.`created_at`, '%Y-%m-01 00:00:00') = TIMESTAMP('2017-03-01 00:00:00')) + AND (LOCATE('iphone', LOWER(`tweets`.`text`)) > 0)) + GROUP BY 1, 2''' + } +] + + +def get_query(query_id: int) -> dict: + return next(filter(lambda x: x["id"] == query_id, queries), None) diff --git a/data/rules.py b/data/rules.py index 0161b6d..4fb3bbd 100644 --- a/data/rules.py +++ b/data/rules.py @@ -18,7 +18,8 @@ 'actions': '', # 'actions_json': '[]', # 'mapping': '{"x": "V1"}', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [16] }, { @@ -34,7 +35,8 @@ 'actions': '', # 'actions_json': "[]", # 'mapping': "{\"x\": \"V1\"}", - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [1, 2, 42] }, { @@ -45,7 +47,8 @@ 'constraints': 'TYPE(x)=TEXT', 'rewrite': '', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [42] }, { @@ -61,7 +64,8 @@ 'actions': '', # 'actions_json': "[]", # 'mapping': "{\"x\": \"V1\", \"y\": \"V2\"}", - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [4, 42] }, { @@ -88,7 +92,8 @@ 'actions': 'SUBSTITUTE(s1, t2, t1) and\n SUBSTITUTE(p1, t2, t1)', # 'actions_json': "[{\"function\": \"substitute\", \"variables\": [\"VL1\", \"V3\", \"V2\"]}, {\"function\": \"substitute\", \"variables\": [\"VL2\", \"V3\", \"V2\"]}]", # 'mapping': "{\"s1\": \"VL1\", \"p1\": \"VL2\", \"tb1\": \"V1\", \"t1\": \"V2\", \"t2\": \"V3\", \"a1\": \"V4\"}", - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [6, 8, 9] }, { @@ -110,7 +115,8 @@ and <> ''', 'actions': 'SUBSTITUTE(s1, t2, t1) and\n SUBSTITUTE(p1, t2, t1)', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [6, 8, 9] }, { @@ -132,7 +138,8 @@ and <> ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [9, 10, 11, 22] }, { @@ -156,7 +163,8 @@ and <> ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [12, 13] }, { @@ -180,7 +188,8 @@ and <> ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [] }, { @@ -200,7 +209,8 @@ WHERE . = ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [] }, { @@ -222,7 +232,8 @@ AND . = ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [] }, { @@ -244,7 +255,8 @@ AND <> ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [] }, { @@ -264,7 +276,8 @@ WHERE ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [17] }, { @@ -275,7 +288,8 @@ 'constraints': '', 'rewrite': 'FROM ', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [27] }, { @@ -286,7 +300,8 @@ 'constraints': '', 'rewrite': 'SELECT . FROM INNER JOIN ON . = . WHERE . = ', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [23, 28, 31, 32, 29] }, { @@ -297,7 +312,8 @@ 'constraints': '', 'rewrite': 'SELECT <> FROM <> WHERE False', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [24] }, { @@ -318,7 +334,8 @@ AND <> AND <>''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [28] }, { @@ -329,7 +346,8 @@ 'constraints': '', 'rewrite': '''SELECT ., . FROM (SELECT , DATE() FROM WHERE = ) AS GROUP BY <>, .''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [31] }, { @@ -340,7 +358,8 @@ 'constraints': '', 'rewrite': '''SELECT <> FROM ((SELECT <> FROM WHERE LIMIT ) UNION (SELECT <> FROM WHERE EXISTS (SELECT FROM WHERE IN (, , , ) AND <>) LIMIT )) LIMIT ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [32] }, { @@ -351,7 +370,8 @@ 'constraints': '', 'rewrite': '''FALSE''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [33] }, { @@ -362,7 +382,8 @@ 'constraints': '', 'rewrite': '''SELECT <> FROM WHERE . IN (SELECT <> FROM WHERE <>) UNION SELECT <> FROM WHERE . IN (SELECT <> FROM WHERE <>)''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [38] }, { @@ -373,7 +394,8 @@ 'constraints': '', 'rewrite': '''1 = CASE WHEN THEN 1 WHEN THEN 1 WHEN THEN 1 ELSE 0 END''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [39] }, { @@ -384,7 +406,8 @@ 'constraints': '', 'rewrite': '''. IN ('', '', '')''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [34] }, { @@ -395,7 +418,8 @@ 'constraints': '', 'rewrite': '''SELECT FROM WHERE <> GROUP BY ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [35] }, { @@ -406,7 +430,8 @@ 'constraints': '', 'rewrite': '''FROM INNER JOIN ON . = . WHERE <>''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [36] }, { @@ -417,7 +442,8 @@ 'constraints': '', 'rewrite': '''SELECT ., ., ., ., ., . FROM JOIN ON . = . WHERE . = AND . = ORDER BY ., .''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [37] }, { @@ -428,7 +454,8 @@ 'constraints': '', 'rewrite': '''SELECT <>, (SELECT FROM WHERE . = . GROUP BY ) AS FROM WHERE = ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [38] }, { @@ -439,7 +466,8 @@ 'constraints': '', 'rewrite': '''EXISTS (SELECT NULL FROM WHERE <> AND . = . AND EXISTS (SELECT NULL FROM WHERE <> AND (. = . OR . = .)))''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [39] }, { @@ -450,7 +478,8 @@ 'constraints': '', 'rewrite': '''SELECT , , , COALESCE((SELECT . FROM WHERE <> AND <> LIMIT 1), ), (SELECT <> FROM WHERE <> AND . IN (, , , , , , ) LIMIT ) FROM WHERE AND ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [40] }, { @@ -461,7 +490,8 @@ 'constraints': '', 'rewrite': '''SELECT NULL FROM ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [41] }, { @@ -489,7 +519,8 @@ LIMIT ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [14] }, { @@ -525,7 +556,8 @@ LIMIT 50 ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [14] }, { @@ -553,7 +585,8 @@ GROUP BY t6. ''', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [15] }, # MySQL Rules @@ -571,7 +604,8 @@ 'actions': '', # 'actions_json': "[]", # 'mapping': "{\"x\": \"V1\"}", - 'database': 'mysql' + 'database': 'mysql', + 'examples': [43] }, { @@ -587,7 +621,8 @@ 'actions': '', # 'actions_json': "[]", # 'mapping': "{\"x\": \"V1\", \"y\": \"V2\"}", - 'database': 'mysql' + 'database': 'mysql', + 'examples': [43] }, { @@ -598,7 +633,8 @@ 'constraints': '', 'rewrite': 'SELECT <> FROM <> WHERE <> GROUP BY <>', 'actions': '', - 'database': 'postgresql' + 'database': 'postgresql', + 'examples': [18] }, { 'id': 2258, @@ -608,7 +644,8 @@ 'constraints': '', 'rewrite': ' IN (, )', 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [19, 21, 23, 34] }, { 'id': 2280, @@ -618,7 +655,8 @@ 'constraints': '', 'rewrite': ' IN (, , )', 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [30] }, { 'id': 2259, @@ -628,7 +666,8 @@ 'constraints': '', 'rewrite': ' IN (<>, )', 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [20] }, { 'id': 2260, @@ -638,7 +677,8 @@ 'constraints': '', 'rewrite': ' IN (<>, <>)', 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [] }, { "id": 2261, @@ -648,7 +688,8 @@ 'constraints': '', "rewrite": " IN (<>, <>)", 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [] }, { "id": 2262, @@ -658,7 +699,8 @@ 'constraints': '', "rewrite": "SELECT DISTINCT , , , FROM , WHERE . = . AND <>", 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [22] }, { "id": 2263, @@ -668,7 +710,8 @@ 'constraints': '', "rewrite": "FROM ", 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [25] }, { "id": 2264, @@ -678,7 +721,8 @@ 'constraints': '', "rewrite": "FROM ", 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [26] }, { "id": 2265, @@ -688,7 +732,8 @@ 'constraints': '', "rewrite": "SELECT <> FROM WHERE . IN (SELECT <> FROM WHERE <>) UNION SELECT <> FROM WHERE . IN (SELECT <> FROM WHERE <>)", 'actions': '', - 'database': 'mysql' + 'database': 'mysql', + 'examples': [38] } ] @@ -712,7 +757,8 @@ def get_rule(key: str) -> dict: 'actions': rule['actions'], 'actions_json': json.loads(rule['actions_json']), 'mapping': json.loads(rule['mapping']), - 'database': rule['database'] + 'database': rule['database'], + 'examples': rule['examples'] } # return a list of rules (json attributes are in str) diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py new file mode 100644 index 0000000..8b176f9 --- /dev/null +++ b/tests/test_query_parser.py @@ -0,0 +1,297 @@ +import mo_sql_parsing as mosql +from core.query_parser import QueryParser +from core.ast.node import ( + QueryNode, SelectNode, FromNode, WhereNode, TableNode, ColumnNode, + LiteralNode, OperatorNode, FunctionNode, GroupByNode, HavingNode, + OrderByNode, LimitNode, OffsetNode, SubqueryNode, VarNode, VarSetNode +) +from core.ast.node_type import NodeType +from data.queries import get_query + +parser = QueryParser() + +def test_parse_1(): + query = get_query(1) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check SELECT clause + + # select_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.SELECT: + # select_clause = child + # break + + # assert select_clause is not None + # assert len(select_clause.children) == 2 + + # Check FROM clause + # from_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.FROM: + # from_clause = child + # break + + # assert from_clause is not None + # table_node = next(iter(from_clause.children)) + # assert isinstance(table_node, TableNode) + # assert table_node.name == "tweets" + + # Check WHERE clause + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + # assert len(where_clause.children) == 1 + + # Check GROUP BY clause + # group_by_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.GROUP_BY: + # group_by_clause = child + # break + + # assert group_by_clause is not None + # assert len(group_by_clause.children) == 1 + + +def test_parse_2(): + query = get_query(6) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check FROM clause has multiple tables + # from_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.FROM: + # from_clause = child + # break + + # assert from_clause is not None + # assert len(from_clause.children) == 2 + + # Check WHERE clause has multiple conditions + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + # condition = next(iter(where_clause.children)) + # assert isinstance(condition, OperatorNode) + + +def test_parse_3(): + query = get_query(9) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check WHERE clause has IN with subquery + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + # condition = next(iter(where_clause.children)) + # assert isinstance(condition, OperatorNode) + # assert condition.name == "AND" + + +def test_parse_4(): + query = get_query(12) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check FROM clause has multiple JOINs + # from_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.FROM: + # from_clause = child + # break + + # assert from_clause is not None + # Check for JOIN nodes in the FROM clause + # join_count = 0 + # for child in from_clause.children: + # if hasattr(child, 'type') and 'JOIN' in str(child.type): + # join_count += 1 + # assert join_count >= 2 + + +def test_parse_5(): + query = get_query(16) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check SELECT clause has aggregation with subquery + # select_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.SELECT: + # select_clause = child + # break + + # assert select_clause is not None + # assert len(select_clause.children) == 3 + + # Check for MAX function + # for child in select_clause.children: + # if isinstance(child, FunctionNode) and child.name == "MAX": + # assert True + # break + + +def test_parse_6(): + query = get_query(18) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check SELECT clause has DISTINCT + # select_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.SELECT: + # select_clause = child + # break + + # assert select_clause is not None + # Check for DISTINCT keyword + # assert hasattr(select_clause, 'distinct') and select_clause.distinct + + # Check FROM clause has multiple tables + # from_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.FROM: + # from_clause = child + # break + + # assert from_clause is not None + # assert len(from_clause.children) == 2 + + +def test_parse_7(): + query = get_query(25) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check WHERE clause has boolean logic + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + # condition = next(iter(where_clause.children)) + # assert isinstance(condition, OperatorNode) + # assert condition.name == "AND" + + +def test_parse_8(): + query = get_query(29) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check for UNION operation (this query has UNION) + # Check if the query contains UNION + # assert 'UNION' in sql.upper() + + # Check for subqueries in WHERE clause + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + + +def test_parse_9(): + query = get_query(31) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check SELECT clause has complex aggregation + # select_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.SELECT: + # select_clause = child + # break + + # assert select_clause is not None + # assert len(select_clause.children) == 3 + + # Check for CASE statement + # for child in select_clause.children: + # if isinstance(child, FunctionNode) and child.name == "CASE": + # assert True + # break + + # Check GROUP BY clause + # group_by_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.GROUP_BY: + # group_by_clause = child + # break + + # assert group_by_clause is not None + + +def test_parse_10(): + query = get_query(42) + sql = query['pattern'] + + qb_ast = parser.parse(sql) + # assert isinstance(qb_ast, QueryNode) + + # Check SELECT clause + # select_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.SELECT: + # select_clause = child + # break + + # assert select_clause is not None + # assert len(select_clause.children) == 2 + + # Check WHERE clause has complex conditions + # where_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.WHERE: + # where_clause = child + # break + + # assert where_clause is not None + + # Check GROUP BY clause + # group_by_clause = None + # for child in qb_ast.children: + # if child.type == NodeType.GROUP_BY: + # group_by_clause = child + # break + + # assert group_by_clause is not None + # assert len(group_by_clause.children) == 2 \ No newline at end of file