From 406b9fdff6fea65ac3308d5f2f420227cd3d7412 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 18 Jan 2023 19:49:16 -0600 Subject: [PATCH 01/12] introduce get_derivative_coeff_dict_at_source counterpart for target --- sumpy/expansion/multipole.py | 7 --- sumpy/kernel.py | 105 +++++++++++++++++------------------ 2 files changed, 50 insertions(+), 62 deletions(-) diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index d63a094b4..4ceaa0a58 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -90,17 +90,10 @@ def coefficients_from_source(self, kernel, avec, bvec, rscale, sac=None): rscale, (1,), sac=sac) def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): - from sumpy.derivative_taker import DifferentiatedExprDerivativeTaker if not self.use_rscale: rscale = 1 base_taker = kernel.get_derivative_taker(bvec, rscale, sac) - # Following is a no-op, but AxisTargetDerivative.postprocess_at_target and - # DirectionalTargetDerivative.postprocess_at_target only handles - # DifferentiatedExprDerivativeTaker and sympy expressions, so we need to - # make the taker a DifferentitatedExprDerivativeTaker instance. - base_taker = DifferentiatedExprDerivativeTaker(base_taker, - {tuple([0]*self.dim): 1}) taker = kernel.postprocess_at_target(base_taker, bvec) result = [] diff --git a/sumpy/kernel.py b/sumpy/kernel.py index 1be1dec93..8549e9c1c 100644 --- a/sumpy/kernel.py +++ b/sumpy/kernel.py @@ -249,7 +249,17 @@ def postprocess_at_target(self, expr, bvec): :arg expr: may be a :class:`sympy.core.expr.Expr` or a :class:`sumpy.derivative_taker.DifferentiatedExprDerivativeTaker`. """ - return expr + from sumpy.derivative_taker import (ExprDerivativeTaker, + DifferentiatedExprDerivativeTaker) + expr_dict = {(0,)*self.dim: 1} + expr_dict = self.get_derivative_coeff_dict_at_target(expr_dict) + if isinstance(expr, ExprDerivativeTaker): + return DifferentiatedExprDerivativeTaker(expr, expr_dict) + + result = 0 + for mi, coeff in expr_dict.items(): + result += coeff * self._diff(expr, bvec, mi) + return result def get_derivative_coeff_dict_at_source(self, expr_dict): r"""Get the derivative transformation of the expression at source @@ -263,6 +273,18 @@ def get_derivative_coeff_dict_at_source(self, expr_dict): """ return expr_dict + def get_derivative_coeff_dict_at_target(self, expr_dict): + r"""Get the derivative transformation of the expression at target + represented by the dictionary expr_dict which is mapping from multi-index + `mi` to coefficient `coeff`. + Expression represented by the dictionary `expr_dict` is + :math:`\sum_{mi} \frac{\partial^mi}{x^mi}G * coeff`. Returns an + expression of the same type. + + This function is meant to be overridden by child classes where necessary. + """ + return expr_dict + def get_global_scaling_const(self): r"""Return a global scaling constant of the kernel. Typically, this ensures that the kernel is scaled so that @@ -959,8 +981,8 @@ def get_expression(self, scaled_dist_vec): def get_derivative_coeff_dict_at_source(self, expr_dict): return self.inner_kernel.get_derivative_coeff_dict_at_source(expr_dict) - def postprocess_at_target(self, expr, bvec): - return self.inner_kernel.postprocess_at_target(expr, bvec) + def get_derivative_coeff_dict_at_target(self, expr_dict): + return self.inner_kernel.get_derivative_coeff_dict_at_target(expr_dict) def get_global_scaling_const(self): return self.inner_kernel.get_global_scaling_const() @@ -1043,23 +1065,19 @@ def __str__(self): def __repr__(self): return f"AxisTargetDerivative({self.axis}, {self.inner_kernel!r})" - def postprocess_at_target(self, expr, bvec): - from sumpy.derivative_taker import (DifferentiatedExprDerivativeTaker, - diff_derivative_coeff_dict) + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector - target_vec = make_sympy_vector(self.target_array_name, self.dim) - # bvec = tgt - ctr - expr = self.inner_kernel.postprocess_at_target(expr, bvec) - if isinstance(expr, DifferentiatedExprDerivativeTaker): - transformation = diff_derivative_coeff_dict(expr.derivative_coeff_dict, - self.axis, target_vec) - return DifferentiatedExprDerivativeTaker(expr.taker, transformation) - else: - # Since `bvec` and `tgt` are two different symbolic variables - # need to differentiate by both to get the correct answer - return expr.diff(bvec[self.axis]) + expr.diff(target_vec[self.axis]) + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) + result = defaultdict(lambda: 0) + for mi, coeff in expr_dict.items(): + new_mi = list(mi) + new_mi[self.axis] += 1 + result[tuple(new_mi)] += coeff + result[mi] += sym.diff(coeff, target_vec[self.axis]) + return dict(result) def replace_base_kernel(self, new_base_kernel): return type(self)(self.axis, @@ -1141,35 +1159,23 @@ def transform(expr): return transform - def postprocess_at_target(self, expr, bvec): - from sumpy.derivative_taker import (DifferentiatedExprDerivativeTaker, - diff_derivative_coeff_dict) - + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector dir_vec = make_sympy_vector(self.dir_vec_name, self.dim) target_vec = make_sympy_vector(self.target_array_name, self.dim) - expr = self.inner_kernel.postprocess_at_target(expr, bvec) + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) - # bvec = tgt - center - if not isinstance(expr, DifferentiatedExprDerivativeTaker): - result = 0 + # avec = tgt - center + result = defaultdict(lambda: 0) + for mi, coeff in expr_dict.items(): for axis in range(self.dim): - # Since `bvec` and `tgt` are two different symbolic variables - # need to differentiate by both to get the correct answer - result += (expr.diff(bvec[axis]) + expr.diff(target_vec[axis])) \ - * dir_vec[axis] - return result - - new_transformation = defaultdict(lambda: 0) - for axis in range(self.dim): - axis_transformation = diff_derivative_coeff_dict( - expr.derivative_coeff_dict, axis, target_vec) - for mi, coeff in axis_transformation.items(): - new_transformation[mi] += coeff * dir_vec[axis] - - return DifferentiatedExprDerivativeTaker(expr.taker, - dict(new_transformation)) + new_mi = list(mi) + new_mi[axis] += 1 + result[tuple(new_mi)] += coeff * dir_vec[axis] + result[mi] += sym.diff(coeff, target_vec[axis]) + return result def get_args(self): return [ @@ -1268,25 +1274,14 @@ def replace_base_kernel(self, new_base_kernel): def replace_inner_kernel(self, new_inner_kernel): return type(self)(self.axis, new_inner_kernel) - def postprocess_at_target(self, expr, avec): + def get_derivative_coeff_dict_at_target(self, expr_dict): from sumpy.symbolic import make_sym_vector as make_sympy_vector - from sumpy.derivative_taker import (ExprDerivativeTaker, - DifferentiatedExprDerivativeTaker) - - expr = self.inner_kernel.postprocess_at_target(expr, avec) target_vec = make_sympy_vector(self.target_array_name, self.dim) - zeros = tuple([0]*self.dim) - mult = target_vec[self.axis] + expr_dict = self.inner_kernel.get_derivative_coeff_dict_at_target( + expr_dict) - if isinstance(expr, DifferentiatedExprDerivativeTaker): - transform = {mi: coeff * mult for mi, coeff in - expr.derivative_coeff_dict.items()} - return DifferentiatedExprDerivativeTaker(expr.taker, transform) - elif isinstance(expr, ExprDerivativeTaker): - return DifferentiatedExprDerivativeTaker({zeros: mult}) - else: - return mult * expr + return {mi: coeff*target_vec[self.axis] for mi, coeff in expr_dict.items()} def get_code_transformer(self): from sumpy.codegen import VectorComponentRewriter From e02c18a31c3eec60c0888ff4deccea0bed628608 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 17 May 2023 14:47:36 -0500 Subject: [PATCH 02/12] Optimize L2P for GPUs --- sumpy/e2p.py | 178 +++++++++++++++------ sumpy/expansion/local.py | 8 + sumpy/expansion/loopy.py | 333 ++++++++++++++++++++++++++++++++++++++- sumpy/fmm.py | 2 + sumpy/symbolic.py | 1 + sumpy/toys.py | 1 + test/test_kernels.py | 2 + 7 files changed, 474 insertions(+), 51 deletions(-) diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 0ec1c48b5..2801f8924 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -21,9 +21,12 @@ """ from abc import ABC, abstractmethod +from pytools import memoize_method import numpy as np import loopy as lp +from loopy.kernel.data import LocalInameTag +import pymbolic.primitives as prim from sumpy.tools import KernelCacheMixin, gather_loopy_arguments from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -70,7 +73,7 @@ def __init__(self, ctx, expansion, kernels, self.context = ctx self.expansion = expansion - self.kernels = kernels + self.kernels = tuple(kernels) self.name = name or self.default_name self.device = device @@ -81,15 +84,18 @@ def __init__(self, ctx, expansion, kernels, def default_name(self): pass + @memoize_method + def get_cached_loopy_knl_and_optimizations(self): + return self.expansion.loopy_evaluator(self.kernels) + def get_cache_key(self): return (type(self).__name__, self.expansion, tuple(self.kernels)) def add_loopy_eval_callable( self, loopy_knl: lp.TranslationUnit) -> lp.TranslationUnit: - inner_knl = self.expansion.loopy_evaluator(self.kernels) + inner_knl, _ = self.get_cached_loopy_knl_and_optimizations() loopy_knl = lp.merge([loopy_knl, inner_knl]) loopy_knl = lp.inline_callable_kernel(loopy_knl, "e2p") - loopy_knl = lp.remove_unused_inames(loopy_knl) for kernel in self.kernels: loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") @@ -117,23 +123,27 @@ class E2PFromSingleBox(E2PBase): def default_name(self): return "e2p_from_single_box" - def get_kernel(self): + def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() + max_work_items = min(32, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] + <> tgt_ibox = target_boxes[itgt_box] {id=fetch_init0} + <> itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1} + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \ + {id=fetch_init2} <> center[idim] = centers[idim, tgt_ibox] {id=fetch_center} @@ -141,9 +151,13 @@ def get_kernel(self): src_expansions[tgt_ibox - src_base_ibox, icoeff] \ {id=fetch_coeffs} - for itgt - <> tgt[idim] = targets[idim, itgt] {id=fetch_tgt,dup=idim} - <> result_temp[iknl] = 0 {id=init_result,dup=iknl} + for itgt_offset + <> itgt = itgt_start + itgt_offset + <> run_itgt = itgt tgt[idim] = targets[idim, itgt] {id=fetch_tgt, \ + dup=idim,if=run_itgt} + <> result_temp[iknl] = 0 {id=init_result,dup=iknl, \ + if=run_itgt} [iknl]: result_temp[iknl] = e2p( [iknl]: result_temp[iknl], [icoeff]: coeffs[icoeff], @@ -155,9 +169,9 @@ def get_kernel(self): targets, """ + ",".join(arg.name for arg in loopy_args) + """ ) {dep=fetch_coeffs:fetch_center:init_result:fetch_tgt,\ - id=update_result} + id=update_result,if=run_itgt} result[iknl, itgt] = result_temp[iknl] * kernel_scaling \ - {id=write_result,dep=update_result} + {id=write_result,dep=update_result,if=run_itgt} end end """], @@ -182,7 +196,9 @@ def get_kernel(self): silenced_warnings="write_race(*_result)", default_offset=lp.auto, fixed_parameters={"dim": self.dim, "nresults": len(self.kernels), - "ncoeffs": ncoeffs}, + "ncoeffs": ncoeffs, + "max_work_items": max_work_items, + "max_ntargets_in_one_box": max_ntargets_in_one_box}, lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -191,13 +207,39 @@ def get_kernel(self): return loopy_knl - def get_optimized_kernel(self): - # FIXME - knl = self.get_kernel() + def get_optimized_kernel(self, max_ntargets_in_one_box): + inner_knl, optimizations = self.get_cached_loopy_knl_and_optimizations() + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) + knl = lp.split_iname(knl, "itgt_offset", 32, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 32, inner_tag="l.0") + knl = lp.add_inames_to_insn(knl, "dummy", + "id:fetch_init* or id:fetch_center or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") + knl = lp.tag_inames(knl, {"dummy": "l.0"}) + knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL) knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") + enforce_variable_access_ordered="no_check", write_code=False) + + for transform in optimizations: + knl = transform(knl) + + # If there are inames tagged as local in the inner kernel + # we need to remove the iname itgt_offset_inner from instructions + # within those inames and also remove the predicate run_itgt + # which depends on itgt_offset_inner + tagged_inames = [iname.name for iname in + knl.default_entrypoint.inames.values() if + iname.name.startswith("e2p_") and any( + isinstance(tag, LocalInameTag) for tag in iname.tags)] + if tagged_inames: + insn_ids = [insn.id for insn in knl.default_entrypoint.instructions + if any(iname in tagged_inames for iname in insn.within_inames)] + match = " or ".join(f"id:{insn_id}" for insn_id in insn_ids) + knl = lp.remove_inames_from_insn(knl, + frozenset(["itgt_offset_inner"]), match) + knl = lp.remove_predicates_from_insn(knl, + frozenset([prim.Variable("run_itgt")]), match) return knl @@ -210,7 +252,9 @@ def __call__(self, queue, **kwargs): :arg centers: :arg targets: """ - knl = self.get_cached_kernel_executor() + max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box") + knl = self.get_cached_kernel_executor( + max_ntargets_in_one_box=max_ntargets_in_one_box) centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type @@ -229,42 +273,49 @@ class E2PFromCSR(E2PBase): def default_name(self): return "e2p_from_csr" - def get_kernel(self): + def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() + max_work_items = min(32, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] - - for itgt - <> tgt[idim] = targets[idim,itgt] {id=fetch_tgt,dup=idim} - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - <> result_temp[iknl] = 0 {id=init_result,dup=iknl} - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> coeffs[icoeff] = \ + <> tgt_ibox = target_boxes[itgt_box] {id=init_box0} + <> itgt_start = box_target_starts[tgt_ibox] {id=init_box1} + <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] \ + {id=init_box2} + <> isrc_box_start = source_box_starts[itgt_box] {id=init_box3} + <> isrc_box_end = source_box_starts[itgt_box+1] {id=init_box4} + + <> result_temp[itgt_offset, iknl] = 0 \ + {id=init_result,dup=iknl} + for isrc_box + <> src_ibox = source_box_lists[isrc_box] {id=fetch_src_box} + <> coeffs[icoeff] = \ src_expansions[src_ibox - src_base_ibox, icoeff] \ - {id=fetch_coeffs,dup=icoeff} - <> center[idim] = centers[idim, src_ibox] \ + {id=fetch_coeffs} + <> center[idim] = centers[idim, src_ibox] \ {dup=idim,id=fetch_center} - [iknl]: result_temp[iknl] = e2p( - [iknl]: result_temp[iknl], + + for itgt_offset + <> itgt = itgt_start + itgt_offset + <> run_itgt = itgt tgt[idim] = targets[idim,itgt] \ + {id=fetch_tgt,dup=idim,if=run_itgt} + + [iknl]: result_temp[itgt_offset, iknl] = e2p( + [iknl]: result_temp[itgt_offset, iknl], [icoeff]: coeffs[icoeff], [idim]: center[idim], [idim]: tgt[idim], @@ -274,11 +325,18 @@ def get_kernel(self): targets, """ + ",".join(arg.name for arg in loopy_args) + """ ) {id=update_result, \ - dep=fetch_coeffs:fetch_center:fetch_tgt:init_result} + dep=fetch_coeffs:fetch_center:fetch_tgt:init_result, \ + if=run_itgt} end - result[iknl, itgt] = result[iknl, itgt] + result_temp[iknl] \ - * kernel_scaling \ - {dep=update_result:init_result,id=write_result,dup=iknl} + end + for itgt_offset + <> itgt2 = itgt_start + itgt_offset {id=init_itgt_for_write} + <> run_itgt2 = itgt_start + itgt_offset < itgt_end \ + {id=init_cond_for_write} + result[iknl, itgt2] = result[iknl, itgt2] + result_temp[ \ + itgt_offset, iknl] * kernel_scaling \ + {dep=update_result:init_result,id=write_result, \ + dup=iknl,if=run_itgt2} end end """], @@ -306,28 +364,48 @@ def get_kernel(self): fixed_parameters={ "ncoeffs": ncoeffs, "dim": self.dim, + "max_work_items": max_work_items, + "max_ntargets_in_one_box": max_ntargets_in_one_box, "nresults": len(self.kernels)}, lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "iknl*:unr") - loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box") + loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,isrc_box,itgt_offset") loopy_knl = self.add_loopy_eval_callable(loopy_knl) loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") return loopy_knl - def get_optimized_kernel(self): - # FIXME - knl = self.get_kernel() - knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) + def get_optimized_kernel(self, max_ntargets_in_one_box): + _, optimizations = self.get_cached_loopy_knl_and_optimizations() + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) + knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"}) + knl = lp.unprivatize_temporaries_with_inames(knl, + "itgt_offset", "result_temp") + knl = lp.split_iname(knl, "itgt_offset", 32, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 32, inner_tag="l.0") + knl = lp.privatize_temporaries_with_inames(knl, + "itgt_offset_outer", "result_temp") + knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result") + knl = lp.duplicate_inames(knl, "itgt_offset_outer", + "id:write_result or id:init_itgt_for_write or id:init_cond_for_write") + knl = lp.add_inames_to_insn(knl, "dummy", + "id:init_box* or id:fetch_src_box or id:fetch_center " + "or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") + knl = lp.add_inames_to_insn(knl, "itgt_offset_inner", "id:fetch_init*") + knl = lp.set_temporary_address_space(knl, "coeffs", lp.AddressSpace.LOCAL) knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") + enforce_variable_access_ordered="no_check", write_code=False) + for transform in optimizations: + knl = transform(knl) return knl def __call__(self, queue, **kwargs): - knl = self.get_cached_kernel_executor() + max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box") + knl = self.get_cached_kernel_executor( + max_ntargets_in_one_box=max_ntargets_in_one_box) centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index dd0838498..701f03a30 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -405,6 +405,14 @@ def loopy_translate_from(self, src_expansion): f"A direct loopy kernel for translation from " f"{src_expansion} to {self} is not implemented.") + def loopy_evaluate(self, kernels): + from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor, + make_e2p_loopy_kernel) + try: + return make_l2p_loopy_kernel_for_volume_taylor(self, kernels) + except NotImplementedError: + return make_e2p_loopy_kernel(self, kernels) + class VolumeTaylorLocalExpansion( VolumeTaylorExpansion, diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 00bbd3dbe..eae414add 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -22,6 +22,7 @@ from typing import Sequence import pymbolic +import pymbolic.primitives as prim import loopy as lp import numpy as np from sumpy.expansion import ExpansionBase @@ -29,6 +30,7 @@ import sumpy.symbolic as sym from sumpy.assignment_collection import SymbolicAssignmentCollection from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments +from math import prod, gcd import logging logger = logging.getLogger(__name__) @@ -126,7 +128,9 @@ def make_e2p_loopy_kernel( for kernel in kernels: loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) - return loopy_knl + optimizations = [] + + return loopy_knl, optimizations def make_p2e_loopy_kernel( @@ -225,3 +229,330 @@ def make_p2e_loopy_kernel( loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) return loopy_knl + + +def make_l2p_loopy_kernel_for_volume_taylor(expansion, kernels): + dim = expansion.dim + order = expansion.order + ncoeffs = len(expansion) + + code_transformers = [expansion.get_code_transformer()] \ + + [kernel.get_code_transformer() for kernel in kernels] + pymbolic_conv = sym.SympyToPymbolicMapper() + + max_deriv_order = 0 + sym_expr_dicts = [] + for kernel in kernels: + expr_dict = {(0,)*dim: 1} + expr_dict = kernel.get_derivative_coeff_dict_at_target(expr_dict) + max_deriv_order = max(max_deriv_order, max(sum(mi) for mi in expr_dict)) + sym_expr_dict = {} + for mi, coeff in expr_dict.items(): + coeff = pymbolic_conv(coeff) + for transform in code_transformers: + coeff = transform(coeff) + sym_expr_dict[mi] = coeff + sym_expr_dicts.append(sym_expr_dict) + + domains = [ + "{[idim]: 0<=idim=", c)]), + )) + + if c != sync_split: + # We now copy before synchronization + v = [pymbolic.var(f"z{i}") for i in range(dim)] + v[slowest_axis], v[0] = v[0], v[slowest_axis] + iorder = pymbolic.var("iorder3") + idx = get_idx(v) + domains += get_domains(v, iorder, with_sync=True)[2:] + + insns.append(lp.Assignment( + assignee=coeffs_copy[0, idx], + expression=coeffs_copy[1, idx], + id="copy_sync", + depends_on=frozenset(["update_coeffs"]), + depends_on_is_final=True, + predicates=frozenset([prim.Comparison(v[0], ">=", c)]), + )) + + v = [pymbolic.var(f"y{i}") for i in range(dim)] + v[slowest_axis], v[0] = v[0], v[slowest_axis] + iorder = pymbolic.var("iorder2") + idx = get_idx(v) + domains += get_domains(v, iorder, with_sync=False)[1:] + + if c == sync_split: + # We did not have to sync within the c rows. + # We last wrote to coeffs_copy[v[0]//c % 2, :] and we read from it. + fetch_idx = (v[0]//c) % 2 + else: + # We need to sync within the c rows. + # We last wrote to coeffs_copy[0, :] and we read from it. + fetch_idx = 0 + + for ikernel, expr_dict in enumerate(sym_expr_dicts): + expr = sum(coeff * prod(powers[i, + v[i] + max_deriv_order - mi[i]] for i in range(dim)) + * (1 / rscale ** sum(mi)) + for mi, coeff in expr_dict.items()) + + insn = lp.Assignment( + assignee=result[ikernel], + expression=(result[ikernel] + + coeffs_copy[fetch_idx, idx] * expr), + id=f"write_{ikernel}", + depends_on=frozenset(["update_monomials", + "update_coeffs" if c == sync_split else "copy_sync"]), + depends_on_is_final=True, + ) + insns.append(insn) + + tags = { + "e2p_iorder1": "l.0", + f"e2p_{x0}_outer": "unr", + f"e2p_{x0}_inner": "unr", + f"e2p_{v[0]}_inner": "unr", + "e2p_iorder2": "unr", + } + if c != sync_split: + tags["e2p_iorder3"] = "l.0" + + optimizations += [ + lambda knl: lp.tag_inames(knl, tags), + lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy", + lp.AddressSpace.LOCAL), + lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"), + ] + + target_args = gather_loopy_arguments((expansion,) + tuple(kernels)) + loopy_knl = lp.make_function(domains, insns, + kernel_data=[ + lp.GlobalArg("result", shape=(len(kernels),), is_input=True, + is_output=True), + lp.GlobalArg("coeffs", + shape=(ncoeffs,), is_input=True, is_output=False), + lp.GlobalArg("center, target", + shape=(dim,), is_input=True, is_output=False), + lp.ValueArg("rscale", is_input=True), + lp.ValueArg("itgt", is_input=True), + lp.ValueArg("ntargets", is_input=True), + lp.GlobalArg("targets", + shape=(dim, "ntargets"), is_input=True, is_output=False), + *target_args, + *temporary_variables, + ...], + name="e2p", + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + fixed_parameters={ + "dim": dim, + "nresults": len(kernels), + "order": order, + "max_deriv_order": max_deriv_order, + "ncoeffs": ncoeffs, + }, + ) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for kernel in kernels: + loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) + + return loopy_knl, optimizations diff --git a/sumpy/fmm.py b/sumpy/fmm.py index de7ac6aec..3c6a4ca20 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -933,6 +933,7 @@ def eval_multipoles(self, result=pot, rscale=self.level_to_rscale(isrc_level), + max_ntargets_in_one_box=self.max_ntargets_in_one_box, wait_for=wait_for, @@ -1069,6 +1070,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): result=pot, rscale=self.level_to_rscale(lev), + max_ntargets_in_one_box=self.max_ntargets_in_one_box, **kwargs) events.append(evt) diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py index 48d43a5ef..99706c9b7 100644 --- a/sumpy/symbolic.py +++ b/sumpy/symbolic.py @@ -120,6 +120,7 @@ def _find_symbolic_backend(): functions = sym.functions Number = sym.Number Float = sym.Float +diff = sym.diff def _coeff_isneg(a): diff --git a/sumpy/toys.py b/sumpy/toys.py index 9fa67cbc2..ec3f88479 100644 --- a/sumpy/toys.py +++ b/sumpy/toys.py @@ -250,6 +250,7 @@ def _e2p(psource, targets, e2p): target_boxes=boxes, box_target_starts=box_target_starts, box_target_counts_nonchild=box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, rscale=psource.rscale, targets=targets, diff --git a/test/test_kernels.py b/test/test_kernels.py index bf03f00d4..f8c280b06 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -367,6 +367,7 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative target_boxes=source_boxes, box_target_starts=box_target_starts, box_target_counts_nonchild=box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, targets=targets, rscale=rscale, @@ -558,6 +559,7 @@ def eval_at(e2p, source_box_nr, rscale): target_boxes=e2p_target_boxes, box_target_starts=e2p_box_target_starts, box_target_counts_nonchild=e2p_box_target_counts_nonchild, + max_ntargets_in_one_box=ntargets, centers=centers, targets=targets, From 0e81ebf49a1635dc2eea1029ec794bf7f9740227 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 28 May 2023 00:31:23 -0500 Subject: [PATCH 03/12] use new method name and 32->256 --- sumpy/e2p.py | 12 ++++++------ sumpy/expansion/local.py | 7 ++++++- sumpy/expansion/loopy.py | 4 +++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 2801f8924..7f331389c 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -126,7 +126,7 @@ def default_name(self): def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - max_work_items = min(32, max(ncoeffs, max_ntargets_in_one_box)) + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ @@ -211,8 +211,8 @@ def get_optimized_kernel(self, max_ntargets_in_one_box): inner_knl, optimizations = self.get_cached_loopy_knl_and_optimizations() knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) - knl = lp.split_iname(knl, "itgt_offset", 32, inner_tag="l.0") - knl = lp.split_iname(knl, "icoeff", 32, inner_tag="l.0") + knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") knl = lp.add_inames_to_insn(knl, "dummy", "id:fetch_init* or id:fetch_center or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") @@ -276,7 +276,7 @@ def default_name(self): def get_kernel(self, max_ntargets_in_one_box): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - max_work_items = min(32, max(ncoeffs, max_ntargets_in_one_box)) + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ @@ -383,8 +383,8 @@ def get_optimized_kernel(self, max_ntargets_in_one_box): knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"}) knl = lp.unprivatize_temporaries_with_inames(knl, "itgt_offset", "result_temp") - knl = lp.split_iname(knl, "itgt_offset", 32, inner_tag="l.0") - knl = lp.split_iname(knl, "icoeff", 32, inner_tag="l.0") + knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") knl = lp.privatize_temporaries_with_inames(knl, "itgt_offset_outer", "result_temp") knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result") diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index 701f03a30..ec92c96ae 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -32,6 +32,11 @@ VolumeTaylorExpansionMixin, LinearPDEConformingVolumeTaylorExpansion) from sumpy.tools import add_to_sac, mi_increment_axis +from sumpy.kernel import Kernel + +import loopy as lp + +from typing import Sequence import logging logger = logging.getLogger(__name__) @@ -405,7 +410,7 @@ def loopy_translate_from(self, src_expansion): f"A direct loopy kernel for translation from " f"{src_expansion} to {self} is not implemented.") - def loopy_evaluate(self, kernels): + def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor, make_e2p_loopy_kernel) try: diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index eae414add..9122bacd3 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -515,11 +515,13 @@ def get_idx(v): if c != sync_split: tags["e2p_iorder3"] = "l.0" + nsplit = min(256, ncoeffs) + optimizations += [ lambda knl: lp.tag_inames(knl, tags), lambda knl: lp.set_temporary_address_space(knl, "e2p_coeffs_copy", lp.AddressSpace.LOCAL), - lambda knl: lp.split_iname(knl, "e2p_icoeff", 32, inner_tag="l.0"), + lambda knl: lp.split_iname(knl, "e2p_icoeff", nsplit, inner_tag="l.0"), ] target_args = gather_loopy_arguments((expansion,) + tuple(kernels)) From 19531d09cbf807c6b1583db3bf72f010e6e9bf6b Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 2 Jun 2023 18:35:15 -0500 Subject: [PATCH 04/12] get_loopy_evaluator -> loopy_evaluator_and_optimizations --- sumpy/e2p.py | 10 +++++----- sumpy/expansion/__init__.py | 11 +++++++---- sumpy/expansion/local.py | 6 ++++-- sumpy/expansion/loopy.py | 7 ++++--- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 7f331389c..c10b0e10c 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -85,15 +85,15 @@ def default_name(self): pass @memoize_method - def get_cached_loopy_knl_and_optimizations(self): - return self.expansion.loopy_evaluator(self.kernels) + def get_loopy_evaluator_and_optimizations(self): + return self.expansion.loopy_evaluator_and_optimizations(self.kernels) def get_cache_key(self): return (type(self).__name__, self.expansion, tuple(self.kernels)) def add_loopy_eval_callable( self, loopy_knl: lp.TranslationUnit) -> lp.TranslationUnit: - inner_knl, _ = self.get_cached_loopy_knl_and_optimizations() + inner_knl, _ = self.get_loopy_evaluator_and_optimizations() loopy_knl = lp.merge([loopy_knl, inner_knl]) loopy_knl = lp.inline_callable_kernel(loopy_knl, "e2p") for kernel in self.kernels: @@ -208,7 +208,7 @@ def get_kernel(self, max_ntargets_in_one_box): return loopy_knl def get_optimized_kernel(self, max_ntargets_in_one_box): - inner_knl, optimizations = self.get_cached_loopy_knl_and_optimizations() + inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations() knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") @@ -378,7 +378,7 @@ def get_kernel(self, max_ntargets_in_one_box): return loopy_knl def get_optimized_kernel(self, max_ntargets_in_one_box): - _, optimizations = self.get_cached_loopy_knl_and_optimizations() + _, optimizations = self.get_loopy_evaluator_and_optimizations() knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"}) knl = lp.unprivatize_temporaries_with_inames(knl, diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py index d72918eb5..8f1f6b014 100644 --- a/sumpy/expansion/__init__.py +++ b/sumpy/expansion/__init__.py @@ -22,7 +22,8 @@ from abc import ABC, abstractmethod from typing import ( - Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type) + Any, ClassVar, Dict, Hashable, List, Optional, Sequence, Tuple, Type, + Callable) from pytools import memoize_method import loopy as lp @@ -66,9 +67,9 @@ class ExpansionBase(ABC): .. automethod:: get_coefficient_identifiers .. automethod:: coefficients_from_source .. automethod:: coefficients_from_source_vec - .. automethod:: loopy_expansion_formation .. automethod:: evaluate - .. automethod:: loopy_evaluator + .. automethod:: loopy_expansion_formation + .. automethod:: loopy_evaluator_and_optimizations .. automethod:: with_kernel .. automethod:: copy @@ -183,7 +184,9 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): in *coeffs*. """ - def loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: """ :returns: a :mod:`loopy` kernel that returns the evaluated target transforms of the potential given by *kernels*. diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py index ec92c96ae..3aed919e3 100644 --- a/sumpy/expansion/local.py +++ b/sumpy/expansion/local.py @@ -36,7 +36,7 @@ import loopy as lp -from typing import Sequence +from typing import Sequence, Callable, Tuple import logging logger = logging.getLogger(__name__) @@ -410,7 +410,9 @@ def loopy_translate_from(self, src_expansion): f"A direct loopy kernel for translation from " f"{src_expansion} to {self} is not implemented.") - def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: from sumpy.expansion.loopy import (make_l2p_loopy_kernel_for_volume_taylor, make_e2p_loopy_kernel) try: diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 9122bacd3..d9fa8d718 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -20,7 +20,7 @@ THE SOFTWARE. """ -from typing import Sequence +from typing import Sequence, Callable, Tuple import pymbolic import pymbolic.primitives as prim import loopy as lp @@ -36,8 +36,9 @@ logger = logging.getLogger(__name__) -def make_e2p_loopy_kernel( - expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit: +def make_e2p_loopy_kernel(expansion: ExpansionBase, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: """ This is a helper function to create a loopy kernel for multipole/local evaluation. This function uses symbolic expressions given by the expansion class, From dcac349a1eef533c734cf1dfba30c0de935509b8 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 27 Jul 2023 16:50:38 -0500 Subject: [PATCH 05/12] Optimize M2P --- sumpy/e2p.py | 29 +-- sumpy/expansion/loopy.py | 331 ++++++++++++++++++++++++++++++++++- sumpy/expansion/multipole.py | 17 ++ 3 files changed, 365 insertions(+), 12 deletions(-) diff --git a/sumpy/e2p.py b/sumpy/e2p.py index c10b0e10c..96f252e95 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -123,10 +123,9 @@ class E2PFromSingleBox(E2PBase): def default_name(self): return "e2p_from_single_box" - def get_kernel(self, max_ntargets_in_one_box): + def get_kernel(self, max_ntargets_in_one_box, max_work_items): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ @@ -208,11 +207,16 @@ def get_kernel(self, max_ntargets_in_one_box): return loopy_knl def get_optimized_kernel(self, max_ntargets_in_one_box): - inner_knl, optimizations = self.get_loopy_evaluator_and_optimizations() - knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) + _, optimizations = self.get_loopy_evaluator_and_optimizations() + + ncoeffs = len(self.expansion) + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box, + max_work_items=max_work_items) + knl = lp.tag_inames(knl, {"itgt_box": "g.0"}) - knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") - knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0") knl = lp.add_inames_to_insn(knl, "dummy", "id:fetch_init* or id:fetch_center or id:kernel_scaling") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:kernel_scaling") @@ -273,10 +277,9 @@ class E2PFromCSR(E2PBase): def default_name(self): return "e2p_from_csr" - def get_kernel(self, max_ntargets_in_one_box): + def get_kernel(self, max_ntargets_in_one_box, max_work_items): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) loopy_knl = lp.make_kernel( [ @@ -379,12 +382,16 @@ def get_kernel(self, max_ntargets_in_one_box): def get_optimized_kernel(self, max_ntargets_in_one_box): _, optimizations = self.get_loopy_evaluator_and_optimizations() - knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box) + ncoeffs = len(self.expansion) + max_work_items = min(256, max(ncoeffs, max_ntargets_in_one_box)) + + knl = self.get_kernel(max_ntargets_in_one_box=max_ntargets_in_one_box, + max_work_items=max_work_items) knl = lp.tag_inames(knl, {"itgt_box": "g.0", "dummy": "l.0"}) knl = lp.unprivatize_temporaries_with_inames(knl, "itgt_offset", "result_temp") - knl = lp.split_iname(knl, "itgt_offset", 256, inner_tag="l.0") - knl = lp.split_iname(knl, "icoeff", 256, inner_tag="l.0") + knl = lp.split_iname(knl, "itgt_offset", max_work_items, inner_tag="l.0") + knl = lp.split_iname(knl, "icoeff", max_work_items, inner_tag="l.0") knl = lp.privatize_temporaries_with_inames(knl, "itgt_offset_outer", "result_temp") knl = lp.duplicate_inames(knl, "itgt_offset_outer", "id:init_result") diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index d9fa8d718..2b7a2b8cf 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -26,7 +26,7 @@ import loopy as lp import numpy as np from sumpy.expansion import ExpansionBase -from sumpy.kernel import Kernel +from sumpy.kernel import Kernel, LaplaceKernel import sumpy.symbolic as sym from sumpy.assignment_collection import SymbolicAssignmentCollection from sumpy.tools import gather_loopy_arguments, gather_loopy_source_arguments @@ -134,6 +134,335 @@ def make_e2p_loopy_kernel(expansion: ExpansionBase, kernels: Sequence[Kernel]) \ return loopy_knl, optimizations +def make_m2p_loopy_kernel_for_volume_taylor( + expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + + dim = expansion.dim + if dim == 3: + return make_m2p_loopy_kernel_for_volume_taylor_3d(expansion, kernels) + elif dim == 2: + return make_m2p_loopy_kernel_for_volume_taylor_2d(expansion, kernels) + else: + raise NotImplementedError() + + +def make_m2p_loopy_kernel_for_volume_taylor_3d( + expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + + if len(kernels) > 1: + raise NotImplementedError() + + kernel = kernels[0] + if not isinstance(kernel, LaplaceKernel): + raise NotImplementedError() + + dim = expansion.dim + order = expansion.order + + if order < 2: + raise NotImplementedError() + + b = pymbolic.var("b") + ncoeffs = len(expansion.get_coefficient_identifiers()) + + temp = pymbolic.var("temp") + inv_r = pymbolic.var("inv_r") + inv_r2 = pymbolic.var("inv_r2") + + domains = [ + "{[idim]: 0<=idim=", 1), + 2*x0*b[0]*temp[x0 - 1, x1, x2], 0) + expr += prim.If(prim.Comparison(x1, ">=", 1), + 2*x1*b[1]*temp[x0, x1 - 1, x2], 0) + expr *= -inv_r2 + insns += [ + lp.Assignment( + id=f"temp_{x0}_{x1}_{x2}", + assignee=temp[x0, x1, x2], + expression=expr, + depends_on=depends_on, + predicates=frozenset([prim.Comparison(x2, ">=", 2)]), + ), + lp.Assignment( + id=f"update_{x0}_{x1}_{x2}", + assignee=result[0], + expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1, x2])] + * temp[x0, x1, x2]), + depends_on=frozenset([f"temp_{x0}_{x1}_{x2}"]) + ) + ] + depends_on = frozenset([f"update_{x0}_{x1}_{x2}"]) + + # For x1 >=2 + x2 = pymbolic.var("y2") + x1 = pymbolic.var("y1") + x0 = pymbolic.var("y0") + domains += [ + f"{{[{x1}]: 2<={x1}<=order}}", + f"{{[{x0}]: 0<={x0}<=1 and {x0}=", 1), + 2*x0*b[0]*temp[x0 - 1, x1 % 2, x2], 0) + expr += prim.If(prim.Comparison(x2, ">=", 1), + 2*x2*b[2]*temp[x0, x1 % 2, x2 - 1], 0) + expr += prim.If(prim.Comparison(x2, ">=", 2), + 2*b[2]*temp[x0, x1 % 2, x2 - 1], 0) + expr *= -inv_r2 + insns += [ + lp.Assignment( + id=f"temp_{x0}_{x1}_{x2}", + assignee=temp[x0, x1 % 2, x2], + expression=expr, + depends_on=depends_on, + ), + lp.Assignment( + id=f"update_{x0}_{x1}_{x2}", + assignee=result[0], + expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1, x2])] + * temp[x0, x1 % 2, x2]), + depends_on=frozenset([f"temp_{x0}_{x1}_{x2}"]) + ) + ] + depends_on = frozenset([f"update_{x0}_{x1}_{x2}"]) + + loopy_knl = lp.make_function(domains, insns, + kernel_data=[ + lp.GlobalArg("result", shape=(len(kernels),), is_input=True, + is_output=True), + lp.GlobalArg("coeffs", + shape=(ncoeffs,), is_input=True, is_output=False), + lp.GlobalArg("center, target", + shape=(dim,), is_input=True, is_output=False), + lp.ValueArg("rscale", is_input=True), + lp.ValueArg("itgt", is_input=True), + lp.ValueArg("ntargets", is_input=True), + lp.GlobalArg("targets", + shape=(dim, "ntargets"), is_input=True, is_output=False), + lp.TemporaryVariable("temp"), + ...], + name="e2p", + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + fixed_parameters={"dim": dim, "order": order}, + ) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.tag_inames(loopy_knl, "x0*:unr") + loopy_knl = lp.tag_inames(loopy_knl, "x1*:unr") + loopy_knl = lp.tag_inames(loopy_knl, "y0*:unr") + for kernel in kernels: + loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.simplify_indices(loopy_knl) + + optimizations = [] + + return loopy_knl, optimizations + + +def make_m2p_loopy_kernel_for_volume_taylor_2d( + expansion: ExpansionBase, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + + if len(kernels) > 1: + raise NotImplementedError() + + kernel = kernels[0] + if not isinstance(kernel, LaplaceKernel): + raise NotImplementedError() + + dim = expansion.dim + order = expansion.order + + if order < 2: + raise NotImplementedError() + + b = pymbolic.var("b") + ncoeffs = len(expansion.get_coefficient_identifiers()) + + temp = pymbolic.var("temp") + r = pymbolic.var("r") + inv_r2 = pymbolic.var("inv_r2") + log = pymbolic.var("log") + + domains = [ + "{[idim]: 0<=idim=", 2)]), + ), + lp.Assignment( + id=f"update_{x0}_{x1}", + assignee=result[0], + expression=(result[0] + coeffs[wrangler.get_storage_index([x0, x1])] + * temp[x0, x1]), + depends_on=frozenset([f"temp_{x0}_{x1}"]) + ) + ] + depends_on = frozenset([f"update_{x0}_{x1}"]) + + loopy_knl = lp.make_function(domains, insns, + kernel_data=[ + lp.GlobalArg("result", shape=(len(kernels),), is_input=True, + is_output=True), + lp.GlobalArg("coeffs", + shape=(ncoeffs,), is_input=True, is_output=False), + lp.GlobalArg("center, target", + shape=(dim,), is_input=True, is_output=False), + lp.ValueArg("rscale", is_input=True), + lp.ValueArg("itgt", is_input=True), + lp.ValueArg("ntargets", is_input=True), + lp.GlobalArg("targets", + shape=(dim, "ntargets"), is_input=True, is_output=False), + lp.TemporaryVariable("temp"), + ...], + name="e2p", + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + fixed_parameters={"dim": dim, "order": order}, + ) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.tag_inames(loopy_knl, "x0*:unr") + for kernel in kernels: + loopy_knl = kernel.prepare_loopy_kernel(loopy_knl) + + loopy_knl = lp.simplify_indices(loopy_knl) + + optimizations = [] + + return loopy_knl, optimizations + + def make_p2e_loopy_kernel( expansion: ExpansionBase, kernels: Sequence[Kernel], strength_usage: Sequence[int], nstrengths: int) -> lp.TranslationUnit: diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index 4ceaa0a58..a74c84e39 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -30,6 +30,11 @@ VolumeTaylorExpansionMixin, LinearPDEConformingVolumeTaylorExpansion) from sumpy.tools import mi_set_axis, add_to_sac, mi_power, mi_factorial +from sumpy.kernel import Kernel + +import loopy as lp + +from typing import Sequence import logging logger = logging.getLogger(__name__) @@ -103,6 +108,18 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): result = sym.Add(*tuple(result)) return result + def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + """ + :returns: a :mod:`loopy` kernel that returns the evaluated + target transforms of the potential given by *kernels*. + """ + from sumpy.expansion.loopy import (make_e2p_loopy_kernel, + make_m2p_loopy_kernel_for_volume_taylor) + try: + return make_m2p_loopy_kernel_for_volume_taylor(self, kernels) + except NotImplementedError: + return make_e2p_loopy_kernel(self, kernels) + def translate_from(self, src_expansion, src_coeff_exprs, src_rscale, dvec, tgt_rscale, sac=None, _fast_version=True): if not isinstance(src_expansion, type(self)): From 3bee51e614777b26bd4d469db28986946aed6b64 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 27 Jul 2023 23:00:06 -0500 Subject: [PATCH 06/12] Fix for refactor --- sumpy/expansion/multipole.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index a74c84e39..d15af1707 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -108,7 +108,9 @@ def evaluate(self, kernel, coeffs, bvec, rscale, sac=None): result = sym.Add(*tuple(result)) return result - def get_loopy_evaluator(self, kernels: Sequence[Kernel]) -> lp.TranslationUnit: + def loopy_evaluator_and_optimizations(self, kernels: Sequence[Kernel]) \ + -> Tuple[lp.TranslationUnit, Sequence[ + Callable[[lp.TranslationUnit], lp.TranslationUnit]]]: """ :returns: a :mod:`loopy` kernel that returns the evaluated target transforms of the potential given by *kernels*. From 00673d25ef85faf929371d697e1d90188a2f40f7 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 27 Jul 2023 23:30:40 -0500 Subject: [PATCH 07/12] import missing types --- sumpy/expansion/multipole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py index d15af1707..c03ac5c9f 100644 --- a/sumpy/expansion/multipole.py +++ b/sumpy/expansion/multipole.py @@ -34,7 +34,7 @@ import loopy as lp -from typing import Sequence +from typing import Sequence, Tuple, Callable import logging logger = logging.getLogger(__name__) From 789c1d0ea3e3aa1a57175693a22b8bedc32336e1 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 27 Jul 2023 23:49:15 -0500 Subject: [PATCH 08/12] fix typo --- sumpy/expansion/loopy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 2b7a2b8cf..3da50ec77 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -234,7 +234,7 @@ def make_m2p_loopy_kernel_for_volume_taylor_3d( x0 = pymbolic.var("x0") domains += [ f"{{[{x1}]: 0<={x1}<=1}}", - f"{{[{x0}]: 0<={x0}<=1 and {x0} Date: Fri, 28 Jul 2023 01:38:15 -0500 Subject: [PATCH 09/12] Fix 2D test case --- sumpy/expansion/loopy.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 3da50ec77..dbf95be10 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -378,19 +378,15 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( ] init_exprs = {(0, 0): log(r)} - - # Order 1 expressions - for i in range(dim): - mi = [0]*dim - mi[i] = 1 - init_exprs[tuple(mi)] = inv_r2 * b[i] - - # Order 2 expressions + init_exprs[(1, 0)] = inv_r2 * b[0] + init_exprs[(0, 1)] = inv_r2 * b[1] init_exprs[(1, 1)] = -2*inv_r2*inv_r2*b[0]*b[1] + init_exprs[(0, 2)] = inv_r2*inv_r2*(b[0]*b[0] - b[1]*b[1]) + init_exprs[(1, 2)] = inv_r2*inv_r2*inv_r2*(2*b[0]*(3*b[1]*b[1] - b[0]*b[0])) depends_on = frozenset(["inv_r2"]) for i in range(2): - for j in range(2): + for j in range(3): insn = lp.Assignment( id=f"init_{i}_{j}", assignee=temp[i, j], @@ -410,8 +406,8 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( f"{{[{x0}]: 0<={x0}<=1}}", f"{{[{x1}]: 0<={x1}<=order-{x0} }}", ] - expr = 2*(x1 + x0 - 1) * b[0] * temp[x0 - 1, x1] + (x1 + x0 - 1)*(x1 + x0 - 2) \ - * temp[x0 - 2, x1] + expr = 2*(x1 + x0 - 1) * b[1] * temp[x0, x1 - 1] + (x1 + x0 - 1)*(x1 + x0 - 2) \ + * temp[x0, x1 - 2] expr *= -inv_r2 insns += [ lp.Assignment( @@ -419,7 +415,7 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( assignee=temp[x0, x1], expression=expr, depends_on=depends_on, - predicates=frozenset([prim.Comparison(x1, ">=", 2)]), + predicates=frozenset([prim.Comparison(x1, ">=", 3)]), ), lp.Assignment( id=f"update_{x0}_{x1}", From 4bb8f0d294adf7ed65b72ccea97b3170432d772d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 28 Jul 2023 01:41:04 -0500 Subject: [PATCH 10/12] skip non linear PDE conforming --- sumpy/expansion/loopy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index dbf95be10..d542e3b22 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -25,7 +25,7 @@ import pymbolic.primitives as prim import loopy as lp import numpy as np -from sumpy.expansion import ExpansionBase +from sumpy.expansion import ExpansionBase, LinearPDEConformingVolumeTaylorExpansion from sumpy.kernel import Kernel, LaplaceKernel import sumpy.symbolic as sym from sumpy.assignment_collection import SymbolicAssignmentCollection @@ -156,6 +156,9 @@ def make_m2p_loopy_kernel_for_volume_taylor_3d( if not isinstance(kernel, LaplaceKernel): raise NotImplementedError() + if not isinstance(expansion, LinearPDEConformingVolumeTaylorExpansion): + raise NotImplementedError() + dim = expansion.dim order = expansion.order @@ -341,6 +344,9 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( if not isinstance(kernel, LaplaceKernel): raise NotImplementedError() + if not isinstance(expansion, LinearPDEConformingVolumeTaylorExpansion): + raise NotImplementedError() + dim = expansion.dim order = expansion.order From 863902058b7479077bd487c91ddbb6812457a2a6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 28 Jul 2023 11:18:27 -0500 Subject: [PATCH 11/12] Fix log(r) for Laplace 2D --- sumpy/expansion/loopy.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index d542e3b22..03e305530 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -358,6 +358,8 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( temp = pymbolic.var("temp") r = pymbolic.var("r") + rscale = pymbolic.var("rscale") + no_scale_r = pymbolic.var("no_scale_r") inv_r2 = pymbolic.var("inv_r2") log = pymbolic.var("log") @@ -375,6 +377,11 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( expression=pymbolic.var("sqrt")(sum(b[i]*b[i] for i in range(dim))), temp_var_type=lp.Optional(None), ), + lp.Assignment( + assignee=no_scale_r, + expression=r * rscale, + temp_var_type=lp.Optional(None), + ), lp.Assignment( id="inv_r2", assignee=inv_r2, @@ -383,7 +390,7 @@ def make_m2p_loopy_kernel_for_volume_taylor_2d( ) ] - init_exprs = {(0, 0): log(r)} + init_exprs = {(0, 0): log(no_scale_r)} init_exprs[(1, 0)] = inv_r2 * b[0] init_exprs[(0, 1)] = inv_r2 * b[1] init_exprs[(1, 1)] = -2*inv_r2*inv_r2*b[0]*b[1] From 6224035277d0efd8d56cd764879fe4c9dc2fe3ea Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sat, 29 Jul 2023 01:27:03 -0500 Subject: [PATCH 12/12] Fix typos --- sumpy/expansion/loopy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sumpy/expansion/loopy.py b/sumpy/expansion/loopy.py index 03e305530..b53d36d0a 100644 --- a/sumpy/expansion/loopy.py +++ b/sumpy/expansion/loopy.py @@ -171,6 +171,7 @@ def make_m2p_loopy_kernel_for_volume_taylor_3d( temp = pymbolic.var("temp") inv_r = pymbolic.var("inv_r") inv_r2 = pymbolic.var("inv_r2") + rscale = pymbolic.var("rscale") domains = [ "{[idim]: 0<=idim=", 1), 2*x2*b[2]*temp[x0, x1 % 2, x2 - 1], 0) expr += prim.If(prim.Comparison(x2, ">=", 2), - 2*b[2]*temp[x0, x1 % 2, x2 - 1], 0) + x2*(x2-1)*temp[x0, x1 % 2, x2 - 2], 0) expr *= -inv_r2 insns += [ lp.Assignment(